class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) hard_update(self.actor_target, self.actor_local) hard_update(self.critic_target, self.critic_local) # Noise process self.noise = OUNoise(action_size, random_seed) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def target_act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_target.eval() with torch.no_grad(): action = self.actor_target(state).cpu().data.numpy() self.actor_target.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset()
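# --- Supporting definitions (sketch) ----------------------------------------
# The Agent above relies on module-level names that are not shown in this
# excerpt: `device`, the learning-rate constants, and a `hard_update` helper.
# This is a minimal sketch of what those definitions usually look like in this
# style of DDPG code; the concrete hyperparameter values are assumptions, not
# taken from the original source.
import torch

BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer

# Run on GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def hard_update(target, source):
    """Copy the source network's parameters into the target network verbatim."""
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)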
class DDPG_agent(nn.Module): def __init__(self, in_actor, in_critic, action_size, num_agents, random_seed): super(DDPG_agent, self).__init__() """init the agent""" self.action_size = action_size self.seed = random_seed # Fully connected actor network self.actor_local = Actor(in_actor, self.action_size, self.seed).to(device) self.actor_target = Actor(in_actor, self.action_size, self.seed).to(device) self.actor_optimizer = Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Fully connected critic network self.critic_local = Critic(in_critic, num_agents * self.action_size, self.seed).to(device) self.critic_target = Critic(in_critic, num_agents * self.action_size, self.seed).to(device) self.critic_optimizer = Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Ornstein-Uhlenbeck noise process for exploration self.noise = OUNoise((action_size), random_seed) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def target_act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" action = self.actor_target(state) return action def reset(self): """ Resets noise """ self.noise.reset()
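# --- OUNoise (sketch) --------------------------------------------------------
# Both agents above construct an OUNoise process, but its definition is not
# included in the excerpt. Below is a minimal Ornstein-Uhlenbeck noise process
# with the (size, seed) constructor these snippets use; the mu/theta/sigma
# defaults are assumptions, and some later variants pass them explicitly with
# slightly different constructor signatures.
import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)   # long-run mean of the process
        self.theta = theta             # mean-reversion rate
        self.sigma = sigma             # noise scale
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state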
class Agent(): def __init__(self, state_size, action_size, random_seed): """ Args: ====== state_size (int): state dim action_size (int): action dim random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # actor net initialization self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # critic net initialization self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Ornstein-Uhlenbeck Exploration Noise Process self.noise = OUNoise(action_space=action_size, seed=random_seed) # Replay memory init self.memory = Memory(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, states, actions, rewards, next_states, dones, is_learning_step, saving_wrong_step_prob = 0.9): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): if reward> 0 or random.uniform(0,1) <= saving_wrong_step_prob: self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and is_learning_step: for _ in range(10): experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """map action to state""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.evolve_state() return np.clip(action, -1, 1) def act_on_all_agents(self, states): """map action to state to all agents""" vectorized_act = np.vectorize(self.act, excluded='self', signature='(n),()->(k)') return vectorized_act(states, True) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update actor and critic nets parameters Args: ====== experiences (Tuple[torch.Tensor]): experience tuples gamma (float): bellman discount factor """ states, actions, rewards, next_states, dones = experiences # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): #Soft update model parameters for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + 
(1.0-tau)*target_param.data)
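# --- Replay memory (sketch) --------------------------------------------------
# The Agent above samples minibatches from a `Memory` object whose definition
# is not shown. This sketch implements the add/sample/__len__ interface the
# class uses and returns batched float tensors, matching how learn() unpacks
# `experiences`; the exact buffer used by the author may differ.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class Memory:
    """Fixed-size buffer that stores transitions and samples random minibatches."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append a single transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a random minibatch and convert it to float tensors on `device`."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)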
class Agent(): def __init__(self, model_name, state_size, action_size, random_seed=0): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.model_name = model_name self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.rewards = list() self.losses = deque(maxlen=100) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def sense(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) self.rewards.append(reward) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset_episode(self): self.rewards = list() self.noise.reset() def ave_loss(self): return sum(self.losses) / max(1, len(self.losses)) def cum_rewards(self): return sum(self.rewards) def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.losses.append(actor_loss) # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def load(self): afn = "{}_actor.mdl".format(self.model_name) cfn = "{}_critic.mdl".format(self.model_name) state_dict = torch.load(afn) self.actor_local.load_state_dict(state_dict) state_dict = torch.load(cfn) self.critic_target.load_state_dict(state_dict) log.info("loaded {}, {}".format(afn, cfn)) return self def save(self): afn = "{}_actor.mdl".format(self.model_name) cfn = "{}_critic.mdl".format(self.model_name) torch.save(self.actor_local.state_dict(), afn) torch.save(self.critic_local.state_dict(), cfn) log.info("saved to {}, {}".format(afn, cfn)) return self
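# --- Actor / Critic networks (sketch) ----------------------------------------
# None of the PyTorch snippets include the Actor and Critic definitions they
# instantiate. The sketch below matches the (state_size, action_size, seed)
# constructor most of them use; the hidden-layer sizes and the point at which
# the action is injected into the critic are assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F


class Actor(nn.Module):
    """Deterministic policy network: maps a state to an action in [-1, 1]."""

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))   # bound each action dimension to [-1, 1]


class Critic(nn.Module):
    """Q-network: maps a (state, action) pair to a scalar value estimate."""

    def __init__(self, state_size, action_size, seed, fc1_units=256, fc2_units=128):
        super().__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units + action_size, fc2_units)  # action enters after the first layer
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, state, action):
        xs = F.relu(self.fc1(state))
        x = torch.cat((xs, action), dim=1)
        x = F.relu(self.fc2(x))
        return self.fc3(x)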
class Agent(): """Main DDPG agent that extracts experiences and learns from them""" def __init__(self, state_size, action_size, random_seed): """ Initializes Agent object. @Param: 1. state_size: dimension of each state. 2. action_size: number of actions. """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) #Actor network self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) #Critic network self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) #Perform hard copy self.hard_copy_weights(self.actor_target, self.actor_local) self.hard_copy_weights(self.critic_target, self.critic_local) #Noise proccess self.noise = OUNoise(action_size, random_seed) #define Ornstein-Uhlenbeck process def reset(self): """Resets the noise process to mean""" self.noise.reset() def act(self, state, add_noise=True): """ Returns a deterministic action given current state. @Param: 1. state: current state, S. 2. add_noise: (bool) add bias to agent, default = True (training mode) """ state = torch.from_numpy(state).float().to( device) #typecast to torch.Tensor self.actor_local.eval() #set in evaluation mode with torch.no_grad(): #reset gradients action = self.actor_local(state).cpu().data.numpy( ) #deterministic action based on Actor's forward pass. self.actor_local.train() #set training mode #If training mode, i.e. add_noise = True, add noise to the model to learn a more accurate policy for current state. if (add_noise): action += self.noise.sample() return np.clip(action, -1, 1) def learn(self, experiences, gamma): """ Learn from a set of experiences picked up from a random sampling of even frequency (not prioritized) of experiences when buffer_size = MINI_BATCH. Updates policy and value parameters accordingly @Param: 1. experiences: (Tuple[torch.Tensor]) set of experiences, trajectory, tau. tuple of (s, a, r, s', done) 2. gamma: immediate reward hyper-parameter, 0.99 by default. 
""" #Extrapolate experience into (state, action, reward, next_state, done) tuples states, actions, rewards, next_states, dones = experiences #Update Critic network actions_next = self.actor_target( next_states ) # Get predicted next-state actions and Q values from target models Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # r + γ * Q-values(a,s) # Compute critic loss using MSE Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) #clip gradients self.critic_optimizer.step() #Update Actor Network # Compute actor loss actions_pred = self.actor_local(states) #gets mu(s) actor_loss = -self.critic_local(states, actions_pred).mean() #gets V(s,a) # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. Copies model τ every experience. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_copy_weights(self, target, source): """ Copy weights from source to target network @Params: 1. target: copy weights into (destination). 2. source: copy weights from (source). """ for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data)
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters # Variables to store best score and scores self.best_score = -np.inf self.score_list = [] def reset_episode(self): self.total_reward = 0.0 self.count = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state self.last_state = next_state # Track rewards self.total_reward += reward self.count += 1 if done: # Average total reward by step counts self.score = self.total_reward / float( self.count) if self.count else 0.0 # Store scores and update the best core self.score_list.append(self.score) if self.score > self.best_score: self.best_score = self.score def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
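# --- Replay buffer for the Keras agents (sketch) -----------------------------
# The Keras-based DDPG classes iterate over sampled experiences as namedtuples
# (e.state, e.action, ...), so their ReplayBuffer returns raw experience
# objects rather than tensors. A minimal sketch under that assumption.
import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer whose sample() returns a list of experience namedtuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a random batch of experience namedtuples."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)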
        # Update: zero both optimizers, backpropagate the critic and actor losses,
        # then step each optimizer.
        model.optimizer_critic.zero_grad()
        model.optimizer_actor.zero_grad()
        loss_value.backward()
        loss_policy.backward()
        model.optimizer_critic.step()
        model.optimizer_actor.step()

        # Soft-update the target networks toward the local networks.
        for target_param, param in zip(model.target_actor.parameters(), model.actor.parameters()):
            target_param.data.copy_(TAU * param.data + (1.0 - TAU) * target_param.data)
        for target_param, param in zip(model.target_critic.parameters(), model.critic.parameters()):
            target_param.data.copy_(TAU * param.data + (1.0 - TAU) * target_param.data)

        if done:
            state = env.reset()
            noise.reset()
            print(rewards)
            rewards = 0
            break

    train_epoch += 1
    if train_epoch % 10 == 0:
        early_stop = test(model, train_epoch // 10)
class DDPGAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """ Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # for MADDPG self.num_agents = num_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((num_agents, action_size), random_seed) self.eps = EPS_START self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM) self.timestep = 0 # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, agent_number): """Save experience in replay memory, and use random sample from buffer to learn.""" self.timestep += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at learning interval settings if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_number) def act(self, states, add_noise): """Returns actions for both agents as per current policy, given their respective states.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): # For MADDPG: get action for each agent and concatenate them for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() if add_noise: actions += self.noise.sample() actions = np.clip(actions, -1, 1) return actions def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # Construct next actions vector relative to the agent if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) # Compute Q targets for current states (y_i) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # For MADDPG: Construct action vector for each agent actions_pred = self.actor_local(states) if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) # Compute actor loss actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update noise decay parameter #self.eps -= self.eps_decay #self.eps = max(self.eps, EPS_FINAL) #self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters.""" for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save_checkpoint(self, agent_number, filename='checkpoint'): checkpoint = { 'action_size': self.action_size, 'state_size': self.state_size, 'actor_state_dict': self.actor_local.state_dict(), 'critic_state_dict': self.critic_local.state_dict() } filepath = filename + '_' + str(agent_number) + '.pth' torch.save(checkpoint, filepath) print(filepath + ' succesfully saved.') def load_checkpoint(self, agent_number, filename='checkpoint'): filepath = filename + '_' + str(agent_number) + '.pth' checkpoint = torch.load(filepath) state_size = checkpoint['state_size'] action_size = checkpoint['action_size'] self.actor_local = Actor(state_size, action_size, seed=42).to(device) self.critic_local = Critic(state_size, action_size, seed=42).to(device) self.actor_local.load_state_dict(checkpoint['actor_state_dict']) self.critic_local.load_state_dict(checkpoint['critic_state_dict']) print(filepath + ' successfully loaded.')
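# --- Assumed constants for DDPGAgent (sketch) --------------------------------
# In addition to the usual DDPG settings, the class above references
# noise-decay and learning-interval constants that are not shown. The values
# below are placeholders illustrating typical magnitudes, not the author's
# actual configuration.
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # critic L2 weight decay
LEARN_EVERY = 1          # learn every N time steps
LEARN_NUM = 5            # gradient updates per learning step
EPS_START = 5.0          # initial scale applied to the exploration noise
EPS_EP_END = 300         # episode after which the noise scale stops decaying
EPS_FINAL = 0.0          # final noise scale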
class Agent(object): def __init__(self, task, hp): self.task = task self.nb_states = task.state_size self.nb_actions = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.action_range = self.action_high - self.action_low # why not use bits to represent continous action space as discrete actions :) self.action_bits = 8 # np.floor( np.log2(self.action_range) + 1 ) self.action_size = (self.nb_actions * self.action_bits ) #.astype(np.int) self.action_factor = self.action_high / 2**self.action_bits self.use_cuda = 1 if hp['USE_CUDA'] is True else 0 if int(hp['SEED']) > 0: self.seed(hp['SEED']) self.buffer_size = hp['EXP_BUFFER_SIZE'] self.batch_size = hp['EXP_BATCH_SIZE'] # Create Actor and Critic Network net_cfg = { 'hidden1': hp['HIDDEN1'], 'hidden2': hp['HIDDEN2'], 'init_w': hp['INIT_W'] } self.actor = Actor(self.nb_states, self.action_size, **net_cfg) self.actor_target = Actor(self.nb_states, self.action_size, **net_cfg) self.actor_optim = Adam(self.actor.parameters(), lr=hp['ACTOR_LR']) self.critic = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_target = Critic(self.nb_states, self.nb_actions, **net_cfg) self.critic_optim = Adam(self.critic.parameters(), lr=hp['CRITIC_LR']) self.hard_copy(self.actor, self.actor_target) self.hard_copy(self.critic, self.critic_target) # Create experience memory buffer self.memory = ExperienceMemory(self.buffer_size, self.batch_size) # init the process of life ... .. self.random_process = OUNoise(size=self.nb_actions, theta=hp['OU_THETA'], mu=hp['OU_MU'], sigma=hp['OU_SIGMA']) self.ou_decay = hp['OU_DECAY'] # Hyper-parameters #self.batch_size = hp.BATCH_SIZE self.tau = hp['TAU'] self.discount = hp['DISCOUNT'] #self.depsilon = 1.0 / args.epsilon # #self.epsilon = 1.0 self.s_t = None # Most recent state self.a_t = None # Most recent action self.is_training = True # nvidia if hp['USE_CUDA']: self.cuda() def hard_copy(self, source, target): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(param.data) def soft_update(self, source, target): for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau) def update_policy(self): # Get Sample batches state_batch, action_batch, reward_batch, \ next_state_batch, terminal_batch = self.memory.batch_samples(self.batch_size) #state_batch = state_batch / 360 #next_state_batch = next_state_batch / 360 #print(action_batch) ########################################### # Prepare for the target q batch with torch.no_grad(): # no grad calc next_actions = [] for action in self.actor_target(to_tensor(next_state_batch)): #print(action) action = to_numpy(action) next_actions.append(np.array(self.action_transform(action))) next_q_values = self.critic_target([ to_tensor(next_state_batch), to_tensor(np.array(next_actions)) ]) # Q_targets = (rewards + self.gamma * Q_targets_next.reshape(len(experiences)) * (1 - dones)) target_q_batch = to_tensor(reward_batch) + \ self.discount*to_tensor(1 - terminal_batch.astype(np.float))*next_q_values ############################################ # Critic update self.critic.zero_grad() q_batch = self.critic( [to_tensor(state_batch), to_tensor(action_batch)]) value_loss = criterion(q_batch, target_q_batch) #print("vloss:",value_loss) value_loss.backward() self.critic_optim.step() ############################################## # Actor update self.actor.zero_grad() next_actions = [] for action in 
self.actor_target(to_tensor(state_batch)): #print(action) action = to_numpy(action) next_actions.append(np.array(self.action_transform(action))) policy_loss = -self.critic( [to_tensor(state_batch), to_tensor(np.array(next_actions))]) policy_loss = policy_loss.mean() #print("ploss:",policy_loss) policy_loss.backward() self.actor_optim.step() ############################################### # Target update self.soft_update(self.actor, self.actor_target) self.soft_update(self.critic, self.critic_target) return None, None #value_loss.detach().squeeze().numpy() ,policy_loss.detach().squeeze().numpy() def eval(self): self.actor.eval() self.actor_target.eval() self.critic.eval() self.critic_target.eval() self.is_training = False def action_transform(self, action): # this dependes on our output activation function action[action <= 0.] = 0 action[action > 0.] = 1 action = np.array(np.split(action, self.nb_actions)).astype(np.bool) action = np.packbits(action).astype(np.float) #, axis=-1) action = action * self.action_factor return action def cuda(self): self.use_cuda = 1 self.actor.cuda() self.actor_target.cuda() self.critic.cuda() self.critic_target.cuda() def step(self, action, reward, next_state, done): # Save experience / reward self.a_t = action self.observe(reward, next_state, done) # If we got our minibatch of experience memories.. # learn from them and slowly change belief :) aloss = None ploss = None if len(self.memory) > self.batch_size: aloss, ploss = self.update_policy() return aloss, ploss def observe(self, r_t, s_t1, done): if self.is_training: self.memory.add(self.s_t, self.a_t, r_t, s_t1, done) self.s_t = s_t1 def random_action(self): action = np.random.uniform(0., 900., self.nb_actions) self.a_t = action return action def act(self, s_t, i_episode=0, decay_epsilon=True): action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0) #action = (action * self.action_range) + self.action_low action = self.action_transform(action) #np.packbits(a, axis=-1) if (self.ou_decay != 0): decay = 1 - (i_episode * self.ou_decay) #print(action, decay) action += self.is_training * decay * self.random_process.sample() self.a_t = action return action def reset(self): self.s_t = self.task.reset() self.random_process.reset() return self.s_t def load_weights(self, output): if output is None: return self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output))) self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output))) def save_model(self, output): torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output)) torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output)) def seed(self, s): torch.manual_seed(s) if self.use_cuda: torch.cuda.manual_seed(s)
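# --- Tensor helpers (sketch) -------------------------------------------------
# update_policy() above calls to_tensor, to_numpy and a `criterion` loss that
# are defined elsewhere in the original project. These are plausible stand-in
# definitions, not the author's originals.
import numpy as np
import torch
import torch.nn as nn

USE_CUDA = torch.cuda.is_available()

criterion = nn.MSELoss()   # critic regression loss used in update_policy()


def to_tensor(ndarray):
    """Convert a numpy array to a float tensor on the active device."""
    t = torch.from_numpy(np.asarray(ndarray, dtype=np.float32))
    return t.cuda() if USE_CUDA else t


def to_numpy(tensor):
    """Detach a tensor, move it to the CPU, and return it as a numpy array."""
    return tensor.detach().cpu().numpy()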
class DDPG(): """ Reinforcement Learning agent that learns using DDPG. """ def __init__( self, task, actor_params={}, critic_params={}, noise_params={}, replay_memory_params={}, algo_params = {} ): # Default Params default_actor_params = {'lr': .001} default_critic_params= {'lr': .001} default_noise_params= {'mu': 0, 'theta': .15, 'sigma': .2} default_replay_memory_params= {'buffer_size': 100000, 'batch_size': 64} default_algo_params = {'gamma': .99, 'tau': .1} # Final Params final_actor_params= {**default_actor_params, **actor_params} final_critic_params={**default_critic_params, **critic_params} final_noise_params={**default_noise_params, **noise_params} final_replay_memory_params={**default_replay_memory_params, **replay_memory_params, } final_algo_params = {**default_algo_params, **algo_params} self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, final_actor_params) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, final_actor_params) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size, final_critic_params) self.critic_target = Critic(self.state_size, self.action_size, final_critic_params) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights(self.critic_local.model.get_weights()) self.actor_target.model.set_weights(self.actor_local.model.get_weights()) # Noise process self.noise = OUNoise( self.action_size, final_noise_params['mu'], final_noise_params['theta'], final_noise_params['sigma'] ) # Replay memory self.batch_size = final_replay_memory_params['batch_size'] self.memory = ReplayBuffer( final_replay_memory_params['buffer_size'], final_replay_memory_params['batch_size'] ) # Algorithm parameters self.gamma = final_algo_params['gamma'] # discount factor self.tau = final_algo_params['tau'] # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, states): """Returns actions for given state(s) as per current policy.""" state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack([e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
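# --- Example configuration (sketch) ------------------------------------------
# A short, hypothetical usage of the configurable DDPG above, showing how the
# parameter dictionaries override the defaults. `Task` and its reset()/step()
# interface are assumptions standing in for the project's environment wrapper.
task = Task()
agent = DDPG(
    task,
    actor_params={'lr': 5e-4},
    critic_params={'lr': 1e-3},
    noise_params={'sigma': 0.1},                        # quieter exploration
    replay_memory_params={'batch_size': 128},
    algo_params={'gamma': 0.98, 'tau': 0.01},
)

for episode in range(500):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)                       # noisy action from the local actor
        next_state, reward, done = task.step(action)    # assumed Task.step() return signature
        agent.step(action, reward, next_state, done)
        state = next_state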
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, env): """Class initialization.""" self.env = env self.state_size = env.observation_space.shape[0] self.action_size = env.action_space.shape[0] self.action_low = env.action_space.low[0] self.action_high = env.action_space.high[0] # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters def reset(self): """Start a new episode.""" self.noise.reset() state = self.env.reset() self.last_state = state return state def step(self, action, reward, next_state, done): """Save in experience buffer and batch learn from buffer step. Save the action, reward, next_state in the experience buffer and if the buffer has enough samples to satisfy the batch size then make a learning step. """ # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory # if len(self.memory) > self.batch_size: if len(self.memory) > self.batch_size * 50: experiences = self.memory.sample() loss_critic = self.learn(experiences) else: loss_critic = None # Roll over last state and action self.last_state = next_state return loss_critic def act(self, state): """Return actions for given state(s) as per current policy. Also add some noise to the action (control-command) to explore the space. """ state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] # add some noise for exploration return list(action + self.noise.sample()) def learn(self, experiences): """Update policy and value parameters. Use given batch of experience tuples from the experience buffer. """ # Convert experience tuples to separate arrays for each element (states, # actions, rewards, etc.)
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from (target) models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) loss_critic = self.critic_local.model.train_on_batch( x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) # Customized actor training function self.actor_local.train_fn([states, action_gradients, 1]) # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) return loss_critic def soft_update(self, local_model, target_model): """Soft update model parameters. Update the target model with the local model weights. Do so gradually by using a soft update parameter, tau. Note ---- After training over a batch of experiences, we could just copy our newly learned weights (from the local model) to the target model. However, individual batches can introduce a lot of variance into the process, so it's better to perform a soft update, controlled by the parameter tau. """ local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights), \ "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + \ (1 - self.tau) * target_weights target_model.set_weights(new_weights)
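# --- Example gym driver (sketch) ---------------------------------------------
# This variant takes a gym-style env directly, so a brief driver is shown for a
# continuous-control task. Pendulum-v1 and the classic 4-tuple step() API are
# assumptions; the original environment is not named in the snippet.
import gym

env = gym.make("Pendulum-v1")
agent = DDPG(env)

for episode in range(200):
    state = agent.reset()
    total_reward, done = 0.0, False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        loss_critic = agent.step(action, reward, next_state, done)   # None until the buffer warms up
        state = next_state
        total_reward += reward
    print(f"episode {episode:3d}  reward {total_reward:8.2f}")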
class MADDPGAgent(object): """Multi Agent DDPG Implementation Paper: https://arxiv.org/abs/1706.02275 I used their code to understand how the agents were implemented https://github.com/openai/maddpg """ def __init__(self, state_size, action_size, num_agents, agent_index, writer, random_seed, dirname, print_every=1000, model_path=None, saved_config=None, eval_mode=False): """Initialize an Agent object. Parameters: state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents agent_index (int): index (id) of current agent writer (object): visdom visualiser for realtime visualisations random_seed (int): random seed dirname (string): output directory to store config, losses print_every (int): how often to print progress model_path (string): if defined, load saved model to resume training eval_mode (bool): whether to use eval mode """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.agent_index = agent_index self.writer = writer self.dirname = dirname self.print_every = print_every # save config params if not saved_config: self.config = CONFIG save_to_json(self.config, '{}/hyperparams.json'.format(self.dirname)) else: self.config = json.load(open(saved_config, 'r')) logger.info( 'Loading config from saved location {}'.format(saved_config)) # Create Critic network self.local_critic = Critic(self.state_size * num_agents, self.action_size * num_agents, random_seed, fc1_units=self.config['FC1'], fc2_units=self.config['FC2']).to(device) self.target_critic = Critic(self.state_size * num_agents, self.action_size * num_agents, random_seed, fc1_units=self.config['FC1'], fc2_units=self.config['FC2']).to(device) # Optimizer self.critic_optimizer = optim.Adam( self.local_critic.parameters(), lr=self.config['LR_CRITIC'], weight_decay=self.config['WEIGHT_DECAY']) # Create Actor network self.local_actor = Actor(self.state_size, self.action_size, random_seed, fc1_units=self.config['FC1'], fc2_units=self.config['FC2']).to(device) self.target_actor = Actor(self.state_size, self.action_size, random_seed, fc1_units=self.config['FC1'], fc2_units=self.config['FC2']).to(device) self.actor_optimizer = optim.Adam(self.local_actor.parameters(), lr=self.config['LR_ACTOR']) # Load saved model (if available) if model_path: logger.info('Loading model from {}'.format(model_path)) self.local_actor.load_state_dict( torch.load('{}/checkpoint_actor_{}.pth'.format( model_path, self.agent_index))) self.target_actor.load_state_dict( torch.load('{}/checkpoint_actor_{}.pth'.format( model_path, self.agent_index))) self.local_critic.load_state_dict( torch.load('{}/checkpoint_critic_{}.pth'.format( model_path, self.agent_index))) self.target_critic.load_state_dict( torch.load('{}/checkpoint_critic_{}.pth'.format( model_path, self.agent_index))) if eval_mode: logger.info('agent {} set to eval mode') self.actor_local.eval() self.noise = OUNoise(self.action_size, random_seed, sigma=self.config['SIGMA']) self.learn_step = 0 def act(self, state, add_noise=True, noise_weight=1): """Get the actions to take under the supplied states Parameters: state (array_like): Game state provided by the environment add_noise (bool): Whether we should apply the noise noise_weight (int): How much weight should be applied to the noise """ state = torch.from_numpy(state).float().to(device) # Run inference in eval mode self.local_actor.eval() with torch.no_grad(): action = self.local_actor(state).cpu().data.numpy() self.local_actor.train() # add noise 
if true if add_noise: action += self.noise.sample() * noise_weight return np.clip(action, -1, 1) def reset(self): """Resets the noise""" self.noise.reset() def learn(self, agents, experience, gamma): """Use the experience to allow agents to learn. The critic of each agent can see the actions taken by all agents and incorporate that in the learning. Parameters: agents (MADDPGAgent): instance of all the agents experience (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ num_agents = len(agents) states, actions, rewards, next_states, dones = experience # ---------------central critic------------------- # use target actor to get action, here we get target actors from # all agents to predict the next action next_actions = torch.zeros( (len(states), num_agents, self.action_size)).to(device) for i, agent in enumerate(agents): next_actions[:, i] = agent.target_actor(states[:, i, :]) # Flatten state and action # e.g from state (100,2,24) --> (100, 48) critic_states = flatten(next_states) next_actions = flatten(next_actions) # calculate target and expected Q_targets_next = self.target_critic(critic_states, next_actions) Q_targets = rewards[:, self.agent_index, :] + ( gamma * Q_targets_next * (1 - dones[:, self.agent_index, :])) Q_expected = self.local_critic(flatten(states), flatten(actions)) # use mse loss critic_loss = F.mse_loss(Q_expected, Q_targets) critic_loss_value = critic_loss.item() self.critic_optimizer.zero_grad() critic_loss.backward() if self.config['CLIP_GRADS']: for param in self.local_critic.parameters(): param.grad.data.clamp_(-1 * self.config['CLAMP_VALUE'], self.config['CLAMP_VALUE']) self.critic_optimizer.step() # ---------------actor--------------------- # Only update the predicted action of current agent predicted_actions = torch.zeros( (len(states), num_agents, self.action_size)).to(device) predicted_actions.data.copy_(actions.data) predicted_actions[:, self.agent_index] = self.local_actor( states[:, self.agent_index]) actor_loss = -self.local_critic(flatten(states), flatten(predicted_actions)).mean() # Kept to remind myself about the mistake that several tooks hours of investigation # and was only found when I looked at grads from self.local_actor.parameters() # actor_loss = -self.local_critic(flatten(states), flatten(actions)).mean() actor_loss_value = actor_loss.item() self.actor_optimizer.zero_grad() actor_loss.backward() if self.config['CLIP_GRADS']: for param in self.local_actor.parameters(): # import pdb; pdb.set_trace() param.grad.data.clamp_(-1 * self.config['CLAMP_VALUE'], self.config['CLAMP_VALUE']) self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # if self.learn_step == 0: # One time only, start local and target with same parameters self._copy_weights(self.local_critic, self.target_critic) self._copy_weights(self.local_actor, self.target_actor) else: self.soft_update(self.local_critic, self.target_critic, self.config["TAU"]) self.soft_update(self.local_actor, self.target_actor, self.config["TAU"]) self.learn_step += 1 return actor_loss_value, critic_loss_value def _copy_weights(self, source_network, target_network): """Copy source network weights to target""" for target_param, source_param in zip(target_network.parameters(), source_network.parameters()): target_param.data.copy_(source_param.data) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def checkpoint(self): """Checkpoint actor and critic models""" if not os.path.exists('{}/multi'.format(self.dirname)): os.makedirs('{}/multi'.format(self.dirname)) torch.save( self.local_critic.state_dict(), '{}/multi/checkpoint_critic_{}.pth'.format(self.dirname, self.agent_index)) torch.save( self.local_actor.state_dict(), '{}/multi/checkpoint_actor_{}.pth'.format(self.dirname, self.agent_index))
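# --- MADDPG helpers (sketch) -------------------------------------------------
# The MADDPGAgent above flattens per-agent observations and actions before
# feeding its centralised critic, and saves its config with save_to_json.
# Neither helper is shown; these are minimal sketches consistent with the
# (batch, num_agents, dim) -> (batch, num_agents * dim) comment in learn().
import json


def flatten(tensor):
    """Collapse the per-agent dimension, e.g. (100, 2, 24) -> (100, 48)."""
    return tensor.reshape(tensor.shape[0], -1)


def save_to_json(config, path):
    """Persist the hyperparameter dictionary alongside the checkpoints."""
    with open(path, 'w') as f:
        json.dump(config, f, indent=2)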
class Agent: """Interacts with and learns from the environment. (see the README for an explanation of the various hyperparameters) """ def __init__(self, state_size: int, action_size: int, agent_no: int, params: dict): """Initialize an Agent object. Args: state_size: dimension of each state action_size: dimension of each action agent_no: agent id params: architecture and hyperparameters """ self.state_size = state_size self.action_size = action_size self.seed = params['agent_seed'] self.batch_size = params['batch_size'] self.lr_actor = params['lr_actor'] self.lr_critic = params['lr_critic'] self.critic_weight_decay = params['critic_weight_decay'] self.gamma = params['gamma'] self.tau = params['tau'] self.update_step = params['update_step'] self.num_agents = params['num_agents'] random.seed(self.seed) self.t_step = 0 self.agent_no = agent_no # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, params['first_hidden_units'], params['second_hidden_units'], self.seed).to(device) self.actor_target = Actor(state_size, action_size, params['first_hidden_units'], params['second_hidden_units'], self.seed).to(device) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size * self.num_agents, action_size * self.num_agents, params['first_hidden_units'], params['second_hidden_units'], self.seed).to(device) self.critic_target = Critic(state_size * self.num_agents, action_size * self.num_agents, params['first_hidden_units'], params['second_hidden_units'], self.seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.critic_weight_decay) # Noise process self.noise = OUNoise(action_size, self.seed, sigma=params['noise_sigma']) def step(self, memory: object, agents: Dict[int, object]): """Save experience in replay memory, and use random sample from buffer to learn every `update_step` if there are enough samples in the buffer to form a batch Args: memory: fixed-size buffer to store experience tuples agents: object references to Agent instances within the environment """ self.t_step += 1 if (len(memory) >= self.batch_size) & (self.t_step % self.update_step == 0): agents_experiences = memory.sample() self.learn(agents_experiences, agents) def act(self, state: np.array, add_noise: bool = True, scale: float = 1.0) -> np.array: """Returns actions for given state as per current policy. Args: state: add_noise: whether to add noise to actions for exploration during training or not (for evaluation) scale: noise scaling parameter Returrns: action: clipped action of the agent """ state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() * scale np.clip(action, -1, 1) return action def reset(self): self.noise.reset() def learn(self, experiences: Dict[int, Tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor, torch.tensor]], agents: Dict[int, object]): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Args: experiences: dictionary with agent-specific tuples with five tensors each that comprise states, actions, rewards, next_states, and dones batch_wise following exactly that order, i.e. 
tensor objects of size (`batch_size`, dim) where dim is `state_size` for states and next_states, `action_size` for actions, and 1 for rewards and dones agents: object references to Agent instances within the environment """ self_rewards = experiences[self.agent_no][2] self_dones = experiences[self.agent_no][4] joint_next_states = torch.cat( [experiences[no][3] for no in range(self.num_agents)], dim=1) # compute actions_next applying ea. agents target policy # on its next_states observations joint_actions_next = torch.cat([ agents[no].actor_target(experiences[no][3]) for no in range(self.num_agents) ], dim=1) # --------------------------- update critic ---------------------------- # # compute the Q_targets (y) using the agent's target critic network with # on the next_states observations of all agents and joint_actions_next Q_targets_next = self.critic_target(joint_next_states, joint_actions_next) Q_targets = self_rewards + (self.gamma * Q_targets_next * (1 - self_dones)) joint_states = torch.cat( [experiences[no][0] for no in range(self.num_agents)], dim=1) joint_actions = torch.cat( [experiences[no][1] for no in range(self.num_agents)], dim=1) # compute Q_expected applying the local critic to joint state observations # and all agents' actions Q_expected = self.critic_local(joint_states, joint_actions) # Compute critic loss critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # joint_actions_pred = torch.cat([ agents[no].actor_local(experiences[no][0]) for no in range(self.num_agents) ], dim=1) # Compute actor loss actor_loss = -self.critic_local(joint_states, joint_actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Args: local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
class Agent(): """Main DDPG agent that extracts experiences and learns from them""" def __init__(self, state_size=24, action_size=2, random_seed=0): """ Initializes Agent object. @Param: 1. state_size: dimension of each state. 2. action_size: number of actions. """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) #Actor network self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(device) self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) #Critic network self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(device) self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC) #Noise proccess self.noise = OUNoise(action_size, random_seed) #define Ornstein-Uhlenbeck process #Replay memory self.memory = ReplayBuffer( self.action_size, BUFFER_SIZE, MINI_BATCH, random_seed) #define experience replay buffer object def step(self, time_step, state, action, reward, next_state, done): """ Saves an experience in the replay memory to learn from using random sampling. @Param: 1. state: current state, S. 2. action: action taken based on current state. 3. reward: immediate reward from state, action. 4. next_state: next state, S', from action, a. 5. done: (bool) has the episode terminated? Exracted version for trajectory used in calculating the value for an action, a.""" self.memory.add(state, action, reward, next_state, done) #append to memory buffer # only learn every n_time_steps if time_step % N_TIME_STEPS != 0: return #check if enough samples in buffer. if so, learn from experiences, otherwise, keep collecting samples. if (len(self.memory) > MINI_BATCH): for _ in range(N_LEARN_UPDATES): experience = self.memory.sample() self.learn(experience) def reset(self): """Resets the noise process to mean""" self.noise.reset() def act(self, state, add_noise=True): """ Returns a deterministic action given current state. @Param: 1. state: current state, S. 2. add_noise: (bool) add bias to agent, default = True (training mode) """ state = torch.from_numpy(state).float().unsqueeze(0).to( device) #typecast to torch.Tensor self.actor_local.eval() #set in evaluation mode with torch.no_grad(): #reset gradients action = self.actor_local(state).cpu().data.numpy( ) #deterministic action based on Actor's forward pass. self.actor_local.train() #set training mode #If training mode, i.e. add_noise = True, add noise to the model to learn a more accurate policy for current state. if (add_noise): action += self.noise.sample() return np.clip(action, -1, 1) def learn(self, experiences, gamma=GAMMA): """ Learn from a set of experiences picked up from a random sampling of even frequency (not prioritized) of experiences when buffer_size = MINI_BATCH. Updates policy and value parameters accordingly @Param: 1. experiences: (Tuple[torch.Tensor]) set of experiences, trajectory, tau. tuple of (s, a, r, s', done) 2. gamma: immediate reward hyper-parameter, 0.99 by default. 
""" #Extrapolate experience into (state, action, reward, next_state, done) tuples states, actions, rewards, next_states, dones = experiences #Update Critic network actions_next = self.actor_target( next_states ) # Get predicted next-state actions and Q values from target models Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # r + γ * Q-values(a,s) # Compute critic loss using MSE Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) #clip gradients self.critic_optimizer.step() #Update Actor Network # Compute actor loss actions_pred = self.actor_local(states) #gets mu(s) actor_loss = -self.critic_local(states, actions_pred).mean() #gets V(s,a) # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. Copies model τ every experience. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG(): """ Deep Deterministic Policy Gradients Agent used to interaction with and learn from an environment """ def __init__(self, state_size: int, action_size: int, num_agents: int, epsilon, random_seed: int): """ Initialize a DDPG Agent Object :param state_size: dimension of state (input) :param action_size: dimension of action (output) :param num_agents: number of concurrent agents in the environment :param epsilon: initial value of epsilon for exploration :param random_seed: random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") self.t_step = 0 # Hyperparameters self.buffer_size = 1000000 self.batch_size = 128 self.update_every = 10 self.num_updates = 10 self.gamma = 0.99 self.tau = 0.001 self.lr_actor = 0.0001 self.lr_critic = 0.001 self.weight_decay = 0 self.epsilon = epsilon self.epsilon_decay = 0.97 self.epsilon_min = 0.005 # Networks (Actor: State -> Action, Critic: (State,Action) -> Value) self.actor_local = Actor(self.state_size, self.action_size, random_seed).to(self.device) self.actor_target = Actor(self.state_size, self.action_size, random_seed).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) self.critic_local = Critic(self.state_size, self.action_size, random_seed).to(self.device) self.critic_target = Critic(self.state_size, self.action_size, random_seed).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay) # Initialize actor and critic networks to start with same parameters self.soft_update(self.actor_local, self.actor_target, tau=1) self.soft_update(self.critic_local, self.critic_target, tau=1) # Noise Setup self.noise = OUNoise(self.action_size, random_seed) # Replay Buffer Setup self.memory = ReplayBuffer(self.buffer_size, self.batch_size) def __str__(self): return "DDPG_Agent" def train(self, env, brain_name, num_episodes=200, max_time=1000, print_every=10): """ Interacts with and learns from a given Unity Environment :param env: Unity Environment the agents is trying to learn :param brain_name: Brain for Environment :param num_episodes: Number of episodes to train :param max_time: How long each episode runs for :param print_every: How often in episodes to print a running average :return: Returns episodes scores and 100 episode averages as lists """ # --------- Set Everything up --------# scores = [] avg_scores = [] scores_deque = deque(maxlen=print_every) # -------- Simulation Loop --------# for episode_num in range(1, num_episodes + 1): # Reset everything env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations episode_scores = np.zeros(self.num_agents) self.reset_noise() # Run the episode for t in range(max_time): actions = self.act(states, self.epsilon) env_info = env.step(actions)[brain_name] next_states, rewards, dones = env_info.vector_observations, env_info.rewards, env_info.local_done self.step(states, actions, rewards, next_states, dones) episode_scores += rewards states = next_states if np.any(dones): break # -------- Episode Finished ---------# self.epsilon *= self.epsilon_decay self.epsilon = max(self.epsilon, self.epsilon_min) scores.append(np.mean(episode_scores)) scores_deque.append(np.mean(episode_scores)) avg_scores.append(np.mean(scores_deque)) if episode_num % print_every == 0: print( f'Episode: {episode_num} \tAverage 
Score: {round(np.mean(scores_deque), 2)}' ) torch.save( self.actor_local.state_dict(), f'{PATH}/checkpoints/{self.__str__()}_Actor_Multiple.pth') torch.save( self.critic_local.state_dict(), f'{PATH}/checkpoints/{self.__str__()}_Critic_Multiple.pth') # -------- All Episodes finished Save parameters and scores --------# # Save Model Parameters torch.save(self.actor_local.state_dict(), f'{PATH}/checkpoints/{self.__str__()}_Actor_Multiple.pth') torch.save(self.critic_local.state_dict(), f'{PATH}/checkpoints/{self.__str__()}_Critic_Multiple.pth') # Save mean score per episode (of the 20 agents) f = open(f'{PATH}/scores/{self.__str__()}_Multiple_Scores.txt', 'w') scores_string = "\n".join([str(score) for score in scores]) f.write(scores_string) f.close() # Save average scores for 100 window average f = open(f'{PATH}/scores/{self.__str__()}_Multiple_AvgScores.txt', 'w') avgScores_string = "\n".join([str(score) for score in avg_scores]) f.write(avgScores_string) f.close() return scores, avg_scores def step(self, states, actions, rewards, next_states, dones): """ What the agent needs to do for every time step that occurs in the environment. Takes in a (s,a,r,s',d) tuple and saves it to memory and learns from experiences. Note: this is not the same as a step in the environment. Step is only called once per environment time step. :param states: array of states agent used to select actions :param actions: array of actions taken by agents :param rewards: array of rewards for last action taken in environment :param next_states: array of next states after actions were taken :param dones: array of bools representing if environment is finished or not """ # Save experiences in replay memory for agent_num in range(self.num_agents): self.memory.add(states[agent_num], actions[agent_num], rewards[agent_num], next_states[agent_num], dones[agent_num]) # Learn "num_updates" times every "update_every" time step self.t_step += 1 if len(self.memory) > self.batch_size and self.t_step % self.update_every == 0: self.t_step = 0 for _ in range(self.num_updates): experiences = self.memory.sample() self.learn(experiences) def act(self, states, epsilon, add_noise=True): """ Returns actions for given states as per current policy. Policy comes from the actor network. 
:param states: array of states from the environment :param epsilon: probability of exploration :param add_noise: bool on whether or not to potentially have exploration for action :return: clipped actions """ states = torch.from_numpy(states).float().to(self.device) self.actor_local.eval() # Sets to eval mode (no gradients) with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() # Sets to train mode (gradients back on) if add_noise and epsilon > np.random.random(): actions += [self.noise.sample() for _ in range(self.num_agents)] return np.clip(actions, -1, 1) def reset_noise(self): """ resets to noise parameters """ self.noise.reset() def learn(self, experiences): """ Update actor and critic networks using a given batch of experiences Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(states) -> actions critic_target(states, actions) -> Q-value :param experiences: tuple of arrays (states, actions, rewards, next_states, dones) sampled from the replay buffer """ states, actions, rewards, next_states, dones = experiences # -------------------- Update Critic -------------------- # # Use target networks for getting next actions and q values and calculate q_targets next_actions = self.actor_target(next_states) next_q_targets = self.critic_target(next_states, next_actions) q_targets = rewards + (self.gamma * next_q_targets * (1 - dones)) # Compute critic loss (Same as DQN Loss) q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(q_expected, q_targets) # Minimize loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # -------------------- Update Actor --------------------- # # Computer actor loss (maximize mean of Q(states,actions)) action_preds = self.actor_local(states) # Optimizer minimizes and we want to maximize so multiply by -1 actor_loss = -1 * self.critic_local(states, action_preds).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() #---------------- Update Target Networks ---------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_network, target_network, tau): """ soft update newtwork parametes θ_target = τ*θ_local + (1 - τ)*θ_target :param local_network: PyTorch Network that is always up to date :param target_network: PyTorch Network that is not up to date :param tau: update (interpolation) parameter """ for target_param, local_param in zip(target_network.parameters(), local_network.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
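# A rough usage sketch for the DDPG class above with a Udacity-style Unity environment.
# The environment file name is hypothetical, and the PATH constant plus existing
# checkpoints/ and scores/ folders are assumed to be defined elsewhere -- adjust them to the
# local setup before running.

from unityagents import UnityEnvironment

env = UnityEnvironment(file_name="Reacher.app")  # hypothetical local build
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]

state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size
num_agents = len(env_info.agents)

agent = DDPG(state_size, action_size, num_agents, epsilon=1.0, random_seed=0)
scores, avg_scores = agent.train(env, brain_name, num_episodes=200)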
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed=0, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, gamma=GAMMA, checkpoint_path='./checkpoints/', pretrained=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.gamma = gamma self.checkpoint_path = checkpoint_path # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, seed).to(device) self.actor_target = Actor(state_size, action_size, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, seed).to(device) self.critic_target = Critic(state_size, action_size, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic) # If pretrained, load weights if pretrained: actor_dict = torch.load(os.path.join(self.checkpoint_path,'checkpoint_actor.pth')) critic_dict = torch.load(os.path.join(self.checkpoint_path,'checkpoint_critic.pth')) self.actor_local.load_state_dict(actor_dict) self.actor_target.load_state_dict(actor_dict) self.critic_local.load_state_dict(critic_dict) self.critic_target.load_state_dict(critic_dict) # Noise process self.noise = OUNoise(action_size, seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device) def step(self, state, action, reward, next_state, done, tstep=LEARN_EVERY+1): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and tstep % LEARN_EVERY == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences) def train(self, env, n_episodes=1000): """Deep Deterministic Policy Gradient (DDPG) Learning. 
Params ====== env (UnityEnvironment): Unity environment n_episodes (int): maximum number of training episodes """ # create checkpoints folder if necessary if not os.path.exists(self.checkpoint_path): os.makedirs(self.checkpoint_path) # get the default brain brain_name = env.brain_names[0] env_info = env.reset(train_mode=True)[brain_name] num_agents = len(env_info.agents) # last 100 scores scores_deque = deque(maxlen=100) # list containing scores from each episode all_scores = [] # list containing window averaged scores avg_scores = [] # for each episode for i_episode in range(1, n_episodes+1): # reset environment env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations # reset noise self.reset() scores = np.zeros(num_agents) # for each timepoint t=0 while True: # agent action actions = self.act(states) # get the next state env_info = env.step(actions)[brain_name] next_states = env_info.vector_observations # get the reward rewards = env_info.rewards # see if episode has ended dones = env_info.local_done # step for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones): self.step(state, action, reward, next_state, done, t) states = next_states scores += rewards t+=1 if np.any(dones): break # save most recent score max_score = np.max(scores) scores_deque.append(max_score) all_scores.append(max_score) avg_scores.append(np.mean(scores_deque)) print('\rEpisode {}\tScore: {:.2f}\tMax Score: {:.2f}'.format(i_episode, max_score, np.mean(scores_deque)), end="") if i_episode % 50 == 0: print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque))) if np.mean(scores_deque)>=0.5: print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque))) torch.save(self.actor_local.state_dict(), self.checkpoint_path+'checkpoint_actor.pth') torch.save(self.critic_local.state_dict(), self.checkpoint_path+'checkpoint_critic.pth') break return all_scores, avg_scores def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) self.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def play(self, env, n_episodes=5): """Play a few episodes with trained agents. Params ====== env (UnityEnvironment): Unity environment n_episodes (int): maximum number of training episodes """ # get the default brain brain_name = env.brain_names[0] brain = env.brains[brain_name] # reset the environment env_info = env.reset(train_mode=False)[brain_name] num_agents = len(env_info.agents) action_size = brain.vector_action_space_size state_size = env_info.vector_observations.shape[1] # for each episode for i_episode in range(1, n_episodes+1): env_info = env.reset(train_mode=False)[brain_name] states = env_info.vector_observations self.reset() # set the noise to zero score = np.zeros(num_agents) while(True): actions = self.act(states, add_noise=False) env_info = env.step(actions)[brain_name] # get the next states next_states = env_info.vector_observations # get the rewards rewards = env_info.rewards # see if the episode has finished for any agent dones = env_info.local_done self.step(states, actions, rewards, next_states, dones) states = next_states score += rewards if np.any(dones): break print('Best Score:', np.max(score)) env.close()
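# The Agent classes in this collection draw mini-batches from a ReplayBuffer that is not shown
# here. The sketch below is one plausible uniform-sampling implementation matching the calls
# used above (add(state, action, reward, next_state, done), sample() returning a tuple of
# tensors, and len()); the optional device argument is an assumption.

import random
from collections import deque, namedtuple

import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer of experience tuples sampled uniformly at random."""

    def __init__(self, action_size, buffer_size, batch_size, seed, device="cpu"):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     ["state", "action", "reward", "next_state", "done"])
        self.seed = random.seed(seed)
        self.device = device

    def add(self, state, action, reward, next_state, done):
        """Append a new experience tuple to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a random mini-batch and return it as float tensors on the target device."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)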
class AgentDDPG(): def __init__(self, env): """ :param task: (class instance) Instructions about the goal and reward """ self.env = env self.state_size = env.observation_space.shape[0] self.action_size = env.action_space.shape[0] self.action_low = env.action_space.low self.action_high = env.action_space.high self.score = 0.0 self.best = 0.0 # Instances of the policy function or actor and the value function or critic # Actor critic with Advantage # Actor local and target self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Save actor model for future use actor_local_model_yaml = self.actor_local.model.to_yaml() with open("actor_local_model.yaml", "w") as yaml_file: yaml_file.write(actor_local_model_yaml) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic local and target self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model with local model self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Initialize the Gaussin Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Initialize the Replay Memory self.buffer_size = 100000 self.batch_size = 64 # original 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Parameters for the Algorithm self.gamma = 0.99 # Discount factor self.tau = 0.01 # Soft update for target parameters Actor Critic with Advantage # Actor can reset the episode def reset_episode(self): # Your total reward goes to 0 same as your count self.total_reward = 0.0 self.count = 0 # Reset the gaussian noise self.noise.reset() # Gets a new state from the task state = self.env.reset() # Protect the state obtained from the task # by storing it as last state self.last_state = state # Return the state obtained from task return state # Actor interact with the environment def step(self, action, reward, next_state, done): # Add to the total reward the reward of this time step self.total_reward += reward # Increase your count based on the number of rewards # received in the episode self.count += 1 # Stored previous state in the replay buffer self.memory.add(self.last_state, action, reward, next_state, done) # Check to see if you have enough to produce a batch # and learn from it if len(self.memory) > self.batch_size: experiences = self.memory.sample() # Train the networks using the experiences self.learn(experiences) # Roll over last state action self.last_state = next_state # Actor determines what to do based on the policy def act(self, state): # Given a state return the action recommended by the policy # Reshape the state to fit the keras model input state = np.reshape(state, newshape=[-1, self.state_size]) # Pass the state to the actor local model to get an action # recommend for the policy in a state action = self.actor_local.model.predict(state)[0] # Because we are exploring we add some noise to the # action vector return list(action + self.noise.sample()) # This is the Actor learning logic called when the agent # take a step to learn def learn(self, experiences): """ Learning means that the networks parameters needs to be updated Using the experineces batch. 
Network learns from experiences not form interaction with the environment """ # Reshape the experience tuples in separate arrays of states, actions # rewards, next_state, done # Your are converting every memeber of the tuple in a column or vector states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Firs we pass a batch of next states to the actor so it tell us what actions # to execute, we use the actor target network instead of the actor local network # because of the advantage principle actions_next = self.actor_target.model.predict_on_batch(next_states) # The critic evaluates the actions taking by the actor and generates the # Q(a,s) value of those actions. This action, state tuple comes from the # ReplayBuffer not from interacting with the environment. # Remember the Critic or value function inputs is states, actions Q_targets_next = self.critic_target.model.predict_on_batch( ([next_states, actions_next])) # With the Q_targets_next that is a vector of action values Q(s,a) of a random selected # next_states from the replay buffer. We calculate the target Q(s,a). # For that we use the TD one-step Sarsa equations # We make terminal states target Q(s,a) 0 and Non terminal the Q_targtes value # This is done to train the critic in a supervise learning fashion. Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train the actor action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # Custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len(target_weights) new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights) def get_episode_score(self): """ Calculate the episode scores :return: None """ # Update score and best score self.score = self.total_reward / float( self.count) if self.count else 0.0 if self.score > self.best: self.best = self.score def save_model_weights(self, actor_model): actor_model.model.save_weights('weights.h5')
class DDPG(): """ This is an Individual DDPG Agent """ def __init__(self, state_size, action_size, seed): """ Initialize a DDPG Agent Object :param state_size: dimension of state (input) for this decentralized actor :param action_size: dimension of action (output) for this decentralized actor :param random_seed: random seed """ self.state_size = state_size self.action_size = action_size self.seed = seed self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Hyperparameters self.buffer_size = 100000 self.batch_size = 256 self.gamma = 0.99 self.tau = 0.01 self.lr_actor = 0.0001 self.lr_critic = 0.001 # Setup Networks (Actor: State -> Action, Critic: (States for all agents, Actions for all agents) -> Value) self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(self.device) self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(self.device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = self.lr_actor) self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(self.device) self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(self.device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr = self.lr_critic) # Initialize local and taret networks to start with same parameters self.soft_update(self.actor_local, self.actor_target, tau=1) self.soft_update(self.critic_local, self.critic_target, tau=1) # Noise Setup self.noise = OUNoise(self.action_size, self.seed) # Replay Buffer Setup self.memory = ReplayBuffer(self.buffer_size, self.batch_size) def __str__(self): return "DDPG_Agent" def reset_noise(self): """ resets to noise parameters """ self.noise.reset() def act(self, state, epsilon, add_noise=True): """ Returns actions for given states as per current policy. Policy comes from the actor network. 
:param state: observations for this individual agent :param epsilon: probability of exploration :param add_noise: bool on whether or not to potentially have exploration for action :return: clipped actions """ state = torch.from_numpy(state).float().to(self.device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise and epsilon > np.random.random(): actions += self.noise.sample() return np.clip(actions, -1,1) def step(self): if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) def learn(self, experiences): """ Update actor and critic networks using a given batch of experiences Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(states) -> actions critic_target(states, actions) -> Q-value :param experiences: tuple of arrays (states, actions, rewards, next_states, dones) sampled from the replay buffer """ states, actions, rewards, next_states, dones = experiences # -------------------- Update Critic -------------------- # # Use target networks for getting next actions and q values and calculate q_targets next_actions = self.actor_target(next_states) next_q_targets = self.critic_target(next_states, next_actions) q_targets = rewards + (self.gamma * next_q_targets * (1 - dones)) # Compute critic loss (Same as DQN Loss) q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(q_expected, q_targets) # Minimize loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # -------------------- Update Actor --------------------- # # Computer actor loss (maximize mean of Q(states,actions)) action_preds = self.actor_local(states) # Optimizer minimizes and we want to maximize so multiply by -1 actor_loss = -1 * self.critic_local(states, action_preds).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ---------------- Update Target Networks ---------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_network, target_network, tau): """ soft update newtwork parametes θ_target = τ*θ_local + (1 - τ)*θ_target :param local_network: PyTorch Network that is always up to date :param target_network: PyTorch Network that is not up to date :param tau: update (interpolation) parameter """ for target_param, local_param in zip(target_network.parameters(), local_network.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
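# Initializing the target networks with soft_update(..., tau=1) is equivalent to the
# hard_update / hard_copy_weights helpers used by other agents in this collection: with
# tau = 1 the interpolation reduces to a straight copy. A quick check with two tiny linear
# layers (hypothetical stand-ins for the actual actor networks) confirms it:

import torch
import torch.nn as nn

def soft_update(local_network, target_network, tau):
    # theta_target = tau * theta_local + (1 - tau) * theta_target
    for target_param, local_param in zip(target_network.parameters(),
                                         local_network.parameters()):
        target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

local, target = nn.Linear(4, 2), nn.Linear(4, 2)
soft_update(local, target, tau=1)  # tau=1 -> target becomes an exact copy of local

for lp, tp in zip(local.parameters(), target.parameters()):
    assert torch.equal(lp.data, tp.data)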
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.timestep = 0 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, agent_num): """Save experience in replay memory, and use random sample from buffer to learn.""" self.timestep += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_num) def act(self, states, eps, add_noise=True): """Returns actions for given state as per current policy.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() if add_noise: actions += eps * self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_num): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # -------------------------- update critic -------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) if agent_num == 0: actions_next = torch.cat((actions_next, actions[:, :2]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # -------------------------- update actor -------------------------- # # Compute actor loss actions_pred = self.actor_local(states) if agent_num == 0: actions_pred = torch.cat((actions_pred, actions[:, :2]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # --------------------- update target networks --------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG_Agent: def __init__(self, ob_sp, act_sp, alow, ahigh, writer, args): self.args = args self.alow = alow self.ahigh = ahigh self.policy = Policy_net(ob_sp, act_sp) self.policy_targ = Policy_net(ob_sp, act_sp) self.qnet = Q_net(ob_sp, act_sp) self.qnet_targ = Q_net(ob_sp, act_sp) self.policy.to(device) self.qnet.to(device) self.policy_targ.to(device) self.qnet_targ.to(device) self.MSE_loss = nn.MSELoss() self.noise = OUNoise(1, 1) hard_update(self.policy_targ, self.policy) hard_update(self.qnet_targ, self.qnet) self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR) self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR) self.memory = ReplayMemory(int(1e6)) self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS, FINAL_STD, INITIAL_STD, warmup_steps=WARMUP_STEPS) self.n_steps = 0 self.n_updates = 0 self.writer = writer def get_action(self, state): if self.args.use_ounoise: noise = self.noise.sample()[0] else: noise = np.random.normal( 0, self.epsilon_scheduler.value(self.n_steps)) st = torch.from_numpy(state).view(1, -1).float() action = self.policy(st) action_with_noise = np.clip(action.item() + noise, self.alow, self.ahigh) if self.args.use_writer: self.writer.add_scalar("action mean", action.item(), self.n_steps) self.writer.add_scalar("action noise", noise, self.n_steps) self.writer.add_scalar("epsilon", self.epsilon_scheduler.value(self.n_steps), self.n_steps) self.writer.add_scalar("action", action_with_noise, self.n_steps) self.n_steps += 1 return action_with_noise def store_transition(self, state, action, reward, next_state, done): self.memory.push(torch.from_numpy(state), torch.tensor(action), torch.tensor(reward), torch.from_numpy(next_state), torch.tensor(done)) def reset(self): self.noise.reset() def train(self): batch = self.memory.sample(min(BATCH_SIZE, len(self.memory))) b_dict = [torch.stack(elem) for elem in Transition(*zip(*batch))] states, actions, rewards, next_states, dones = \ b_dict[0], b_dict[1].view(-1, 1), \ b_dict[2].view(-1, 1).float().to(device), b_dict[3], \ b_dict[4].view(-1, 1).float().to(device) # CRITIC LOSS: Q(s, a) += (r + gamma*Q'(s, π'(s)) - Q(s, a)) # inputs computation inputs_critic = self.qnet(states, actions) # targets with torch.no_grad(): policy_acts = self.policy_targ(next_states) targ_values = self.qnet_targ(next_states, policy_acts) targets_critics = rewards + GAMMA * (1 - dones) * targ_values loss_critic = self.MSE_loss(inputs_critic, targets_critics) self.q_optimizer.zero_grad() loss_critic.backward() # nn.utils.clip_grad_norm_(self.qnet.parameters(), GRAD_CLIP) self.q_optimizer.step() # ACTOR objective: derivative of Q(s, π(s | ø)) with respect to ø actor_loss = -self.qnet(states, self.policy(states)).mean() self.p_optimizer.zero_grad() actor_loss.backward() # nn.utils.clip_grad_norm_(self.policy.parameters(), GRAD_CLIP) self.p_optimizer.step() soft_update(self.policy_targ, self.policy, TAU) soft_update(self.qnet_targ, self.qnet, TAU) if self.args.use_writer: self.writer.add_scalar("critic_loss", loss_critic.item(), self.n_updates) self.writer.add_scalar("actor_loss", actor_loss.item(), self.n_updates) self.n_updates += 1
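# This agent anneals its Gaussian exploration noise through a LinearSchedule helper that is not
# included in this listing. The sketch below is one plausible implementation consistent with how
# it is called (LinearSchedule(E_GREEDY_STEPS, FINAL_STD, INITIAL_STD, warmup_steps=...) and
# .value(n_steps)); the exact argument order and the warmup behaviour are assumptions.

class LinearSchedule:
    """Linearly anneal a value from `initial` to `final` over `schedule_steps`,
    holding it at `initial` during an optional warmup period."""

    def __init__(self, schedule_steps, final, initial, warmup_steps=0):
        self.schedule_steps = schedule_steps
        self.final = final
        self.initial = initial
        self.warmup_steps = warmup_steps

    def value(self, step):
        if step < self.warmup_steps:
            return self.initial
        progress = min(float(step - self.warmup_steps) / self.schedule_steps, 1.0)
        return self.initial + progress * (self.final - self.initial)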
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.name = "DDPG" self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, 'actor_local') self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, 'actor_target') # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size, 'critic_local') self.critic_target = Critic(self.state_size, self.action_size, 'critic_target') # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0.0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters # Reward counter self.total_reward = 0 self.n_steps = 0 def load(self): self.actor_local.load() self.actor_target.load() self.critic_local.load() self.critic_target.load() print("Agent's weights loaded from disk.") def save(self): self.actor_local.save() self.actor_target.save() self.critic_local.save() self.critic_target.save() print("Agent's weights saved to disk.") def reset_episode(self): self.total_reward = 0 self.n_steps = 0 self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Add reward to total self.total_reward += reward self.n_steps += 1 # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, state, add_noise=True): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] # Hack, rescale rotor revs to +-5 range from average # rev_mean = np.mean(action) # action = (action-450)/450 # action *= 50 # action += rev_mean if add_noise: action += self.noise.sample() # additive noise for exploration return list(action) def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
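# A rough interaction loop for this task-based agent, assuming a task object whose step(action)
# returns (next_state, reward, done) and which exposes the attributes read in __init__;
# the episode budget is arbitrary and `task` is assumed to be constructed elsewhere.

num_episodes = 500  # arbitrary budget

agent = DDPG(task)
for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print("Episode {:4d}  total reward: {:.2f}".format(i_episode, agent.total_reward))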
class MultiDDPGAgent: """ Multi-agent DDPG implementation.""" def __init__(self, state_size, action_size, num_agents, cfg): """Initialize a MADDPG Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): Number of agents in environment cfg (config object): main configuration with other settings """ print("Initializing MADDPG agent with {:d} agents!".format(num_agents)) self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(cfg.random_seed) self.cfg = cfg # initializing list of single agents (2 for tennis) self.agents = [] for aid in range(num_agents): agent = SingleDDPGAgent(state_size, action_size, cfg, num_agents=num_agents, agent_id=aid) self.agents.append(agent) self.t_step = 0 # Noise process self.noise_scale = self.cfg.noise_scale self.noise = OUNoise(action_size, cfg.random_seed, theta=cfg.theta_ou, sigma=cfg.sigma_ou) # as long as active, will fill replay buffer with random memories, no learning self.prefetching = True # Replay memory for shared experiences (all agents) self.memory = ReplayBuffer(action_size, cfg.buffer_size, cfg.batch_size, cfg.random_seed, cfg) def add_noise(self): if self.cfg.use_ou: return self.noise_scale * self.noise.sample() else: # Gaussian noise return self.noise_scale * np.random.normal(0, 1.0, self.action_size) def reset(self): self.t_step = 0 self.noise.reset() def act(self, state_all, add_noise=True): """ Let all agents act. Receives full state tensor of all agents and outputs all actions (num_agents x action_size). """ actions = [] for aid in range(self.num_agents): # only add noise after pre-loading memories noise = 0 if not self.prefetching and add_noise: noise = self.add_noise() actions.append( self.agents[aid].act(state_all[aid], add_noise=False) + noise) return actions def _target_act(self, states_all): """ Internal function used by learn function. Gets target network actions for all agents. """ target_actions = [] for aid in range(self.num_agents): # states_all format (batch size, num_agents, state size) target_actions.append(self.agents[aid].target_act( states_all[:, aid, :])) return target_actions def step(self, states, actions, rewards, next_states, dones): """ Save experiences in global memory. If memory large enough, use it to learn each agent. """ max_prio = self.memory.get_max_priority() self.memory.add(states, actions, rewards, next_states, max_prio, dones) # start training if memory size large enough. if len(self.memory) >= max(self.cfg.batch_size, self.cfg.init_replay): if self.prefetching: self.prefetching = False print("Pre-loading of memories complete, starting training!") else: return self.t_step = (self.t_step + 1) % self.cfg.learn_every if self.t_step == 0: for _ in range(self.cfg.learn_steps): self.learn_all() self.noise_scale = max(self.noise_scale * self.cfg.noise_decay, self.cfg.noise_scale_min) self.t_step += 1 def learn_all(self): """Generates full batch input and performs individual learning steps.""" samples = self.memory.sample() for aid in range(self.num_agents): self.learn(samples, aid) self.soft_update_all() def learn(self, samples, agent_number): """ Update critic and actor networks of given agent using provided samples from replay memory. 
""" # from memory states, actions, rewards, next_states, priorities, dones, indices = samples # creating full states and next_states with shape (batch_size, -1) batch_size = self.cfg.batch_size full_states = states.view(batch_size, -1) full_next_states = next_states.view(batch_size, -1) # selecting the correct agent agent = self.agents[agent_number] # 1. Update of critic agent.critic_optimizer.zero_grad() # critic loss = TD-error, so batch mean of (y- Q*(s,a))^2 # y = current reward + discount * Q*(st+1,at+1) from target network Q* # shape (batch_size, num_agents, -1) target_actions = torch.cat(self._target_act( next_states.view(batch_size, self.num_agents, -1)), dim=1) # returns list, so change to shape (batch_size, action_size, num_agent) # get next q values from target critic q_next = agent.critic_target(full_next_states, target_actions.to(device)) y = rewards[:, agent_number].view(-1, 1) + \ self.cfg.gamma * q_next * (1 - dones[:, agent_number].view(-1, 1)) q = agent.critic_local(full_states, actions.view(batch_size, -1)) critic_loss = None if self.cfg.loss_l == 1: huber_loss = torch.nn.SmoothL1Loss() critic_loss = huber_loss(q, y.detach()) elif self.cfg.loss_l == 2: critic_loss = F.mse_loss(q, y.detach()) else: AssertionError("L{:d} loss is not supported!".format( self.cfg.loss_l)) # optimization of critic (local) loss critic_loss.backward() torch.nn.utils.clip_grad_norm_(agent.critic_local.parameters(), 1) agent.critic_optimizer.step() # 2. Update of actor network using policy gradient agent.actor_optimizer.zero_grad() # make input to agent # detach the other agents to save computation # saves some time for computing derivative q_input = [ self.agents[i].actor_local( states.view(batch_size, self.num_agents, -1)[:, i, :]) if i == agent_number else self.agents[i].actor_local( states.view(batch_size, self.num_agents, -1)[:, i, :]).detach() for i in range(self.num_agents) ] q_input = torch.cat(q_input, dim=1) # combine all the actions and observations for input to critic # many of the obs are redundant, and obs[1] contains all useful information already # get the actual policy gradient here actor_loss = -agent.critic_local(full_states, q_input).mean() # optimize actor_loss.backward() torch.nn.utils.clip_grad_norm_(agent.actor_local.parameters(), 1) agent.actor_optimizer.step() # soft update the models agent.soft_update(agent.critic_local, agent.critic_target, self.cfg.tau) agent.soft_update(agent.actor_local, agent.actor_target, self.cfg.tau) def soft_update_all(self): """soft update targets""" for agent in self.agents: agent.soft_update(agent.critic_local, agent.critic_target, self.cfg.tau) agent.soft_update(agent.actor_local, agent.actor_target, self.cfg.tau) def save_weights(self, model_save_path): """ Simple method to save network weights. """ for aid, agent in enumerate(self.agents): agent.save_weights(model_save_path, suffix="_{:d}".format(aid)) def load_weights(self, model_save_path): """ Method to load network weights from saved files. """ for aid, agent in enumerate(self.agents): agent.load_weights(model_save_path, suffix="_{:d}".format(aid))
class DDPGAgent(object): """ General class for DDPG agents (policy, critic, target policy, target critic, exploration noise) """ def __init__(self, num_in_pol, num_out_pol, num_in_critic, hidden_dim_actor=120, hidden_dim_critic=64,lr_actor=0.01,lr_critic=0.01,batch_size=64, max_episode_len=100,tau=0.02,gamma = 0.99,agent_name='one', discrete_action=False): """ Inputs: num_in_pol (int): number of dimensions for policy input num_out_pol (int): number of dimensions for policy output num_in_critic (int): number of dimensions for critic input """ self.policy = Actor(num_in_pol, num_out_pol, hidden_dim=hidden_dim_actor, discrete_action=discrete_action) self.critic = Critic(num_in_pol, 1,num_out_pol, hidden_dim=hidden_dim_critic) self.target_policy = Actor(num_in_pol, num_out_pol, hidden_dim=hidden_dim_actor, discrete_action=discrete_action) self.target_critic = Critic(num_in_pol, 1,num_out_pol, hidden_dim=hidden_dim_critic) hard_update(self.target_policy, self.policy) hard_update(self.target_critic, self.critic) self.policy_optimizer = Adam(self.policy.parameters(), lr=lr_actor) self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic,weight_decay=0) self.policy = self.policy.float() self.critic = self.critic.float() self.target_policy = self.target_policy.float() self.target_critic = self.target_critic.float() self.agent_name = agent_name self.gamma = gamma self.tau = tau self.batch_size = batch_size #self.replay_buffer = ReplayBuffer(1e7) self.replay_buffer = ReplayBufferOption(500000,self.batch_size,12) self.max_replay_buffer_len = batch_size * max_episode_len self.replay_sample_index = None self.niter = 0 self.eps = 5.0 self.eps_decay = 1/(250*5) self.exploration = OUNoise(num_out_pol) self.discrete_action = discrete_action self.num_history = 2 self.states = [] self.actions = [] self.rewards = [] self.next_states = [] self.dones = [] def reset_noise(self): if not self.discrete_action: self.exploration.reset() def scale_noise(self, scale): if self.discrete_action: self.exploration = scale else: self.exploration.scale = scale def act(self, obs, explore=False): """ Take a step forward in environment for a minibatch of observations Inputs: obs : Observations for this agent explore (boolean): Whether or not to add exploration noise Outputs: action (PyTorch Variable): Actions for this agent """ #obs = obs.reshape(1,48) state = Variable(torch.Tensor(obs),requires_grad=False) self.policy.eval() with torch.no_grad(): action = self.policy(state) self.policy.train() # continuous action if explore: action += Variable(Tensor(self.eps * self.exploration.sample()),requires_grad=False) action = torch.clamp(action, min=-1, max=1) return action def step(self, agent_id, state, action, reward, next_state, done,t_step): self.states.append(state) self.actions.append(action) self.rewards.append(reward) self.next_states.append(next_state) self.dones.append(done) #self.replay_buffer.add(state, action, reward, next_state, done) if t_step % self.num_history == 0: # Save experience / reward self.replay_buffer.add(self.states, self.actions, self.rewards, self.next_states, self.dones) self.states = [] self.actions = [] self.rewards = [] self.next_states = [] self.dones = [] # Learn, if enough samples are available in memory if len(self.replay_buffer) > self.batch_size: obs, acs, rews, next_obs, don = self.replay_buffer.sample() self.update(agent_id ,obs, acs, rews, next_obs, don,t_step) def update(self, agent_id, obs, acs, rews, next_obs, dones ,t_step, logger=None): obs = torch.from_numpy(obs).float() acs = 
torch.from_numpy(acs).float() rews = torch.from_numpy(rews[:,agent_id]).float().view(-1, 1) next_obs = torch.from_numpy(next_obs).float() dones = torch.from_numpy(dones[:,agent_id]).float().view(-1, 1) # keep (batch, 1) shapes so they broadcast with the critic output acs = acs.view(-1,2) # --------- update critic ------------ # self.critic_optimizer.zero_grad() all_trgt_acs = self.target_policy(next_obs) target_value = (rews + self.gamma * self.target_critic(next_obs,all_trgt_acs) * (1 - dones)) actual_value = self.critic(obs,acs) vf_loss = MSELoss(actual_value, target_value.detach()) # Minimize the loss vf_loss.backward() #torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1) self.critic_optimizer.step() # --------- update actor --------------- # self.policy_optimizer.zero_grad() if self.discrete_action: curr_pol_out = self.policy(obs) curr_pol_vf_in = gumbel_softmax(curr_pol_out, hard=True) else: curr_pol_out = self.policy(obs) curr_pol_vf_in = curr_pol_out pol_loss = -self.critic(obs,curr_pol_vf_in).mean() #pol_loss += (curr_pol_out**2).mean() * 1e-3 pol_loss.backward() torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 1) self.policy_optimizer.step() self.update_all_targets() self.eps -= self.eps_decay self.eps = max(self.eps, 0) if logger is not None: logger.add_scalars('agent_%s/losses' % self.agent_name, {'vf_loss': vf_loss, 'pol_loss': pol_loss}, self.niter) def update_all_targets(self): """ Update all target networks (called after normal updates have been performed for each agent) """ soft_update(self.critic, self.target_critic, self.tau) soft_update(self.policy, self.target_policy, self.tau) def get_params(self): return {'policy': self.policy.state_dict(), 'critic': self.critic.state_dict(), 'target_policy': self.target_policy.state_dict(), 'target_critic': self.target_critic.state_dict(), 'policy_optimizer': self.policy_optimizer.state_dict(), 'critic_optimizer': self.critic_optimizer.state_dict()} def load_params(self, params): self.policy.load_state_dict(params['policy']) self.critic.load_state_dict(params['critic']) self.target_policy.load_state_dict(params['target_policy']) self.target_critic.load_state_dict(params['target_critic']) self.policy_optimizer.load_state_dict(params['policy_optimizer']) self.critic_optimizer.load_state_dict(params['critic_optimizer'])
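# For the discrete-action branch above, gumbel_softmax(..., hard=True) turns the policy logits
# into one-hot actions while keeping them differentiable (straight-through trick). PyTorch
# ships an equivalent in torch.nn.functional; the helper actually imported by this agent may
# differ, so this is only an illustration:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 3, requires_grad=True)        # batch of 4, 3 discrete actions
one_hot = F.gumbel_softmax(logits, tau=1.0, hard=True)

print(one_hot)             # rows are one-hot, yet gradients still flow back to `logits`
print(one_hot.sum(dim=1))  # each row sums to 1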
class SingleDDPGAgent:
    """Single-agent DDPG. Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, cfg, num_agents=1, agent_id=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            cfg (config object): main configuration with other passed settings
            num_agents (int): optional (default: 1). If >1, multiplies the state
                and action space sizes for the critic. Used with MADDPG.
            agent_id (int): optional (default: 0). Agent id for MADDPG.
        """
        print("Initializing single DDPG agent!")
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(cfg.random_seed)
        self.n_agents = num_agents
        self.agent_id = agent_id
        self.cfg = cfg

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, cfg.random_seed,
                                 cfg.dense_layers_actor).to(device)
        self.actor_target = Actor(state_size, action_size, cfg.random_seed,
                                  cfg.dense_layers_actor).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=cfg.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents,
                                   cfg.random_seed,
                                   cfg.dense_layers_critic).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents,
                                    cfg.random_seed,
                                    cfg.dense_layers_critic).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=cfg.lr_critic,
                                           weight_decay=cfg.weight_decay)

        self.hard_copy_weights(self.critic_local, self.critic_target)
        self.hard_copy_weights(self.actor_local, self.actor_target)

        self.t_step = 0

        # Noise process
        self.noise = OUNoise(action_size, cfg.random_seed,
                             theta=cfg.theta_ou, sigma=cfg.sigma_ou)

        # Replay memory
        self.memory = ReplayBuffer(action_size, cfg.buffer_size,
                                   cfg.batch_size, cfg.random_seed, cfg)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward with the current maximum priority
        max_prio = self.memory.get_max_priority()
        self.memory.add(state, action, reward, next_state, max_prio, done)

        # Learn every update_every time steps, if enough samples are available in memory.
        self.t_step = (self.t_step + 1) % self.cfg.update_every
        if self.t_step == 0:
            if len(self.memory) > self.cfg.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences, self.cfg.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state.view(1, -1)).squeeze().cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def target_act(self, state):
        """Let the target network return an action."""
        self.actor_target.eval()
        with torch.no_grad():
            action_target = self.actor_target(state)
        # keep the result a tensor and clamp to the valid action range
        return torch.clamp(action_target, -1, 1)

    def reset(self):
        self.t_step = 0
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', prio, done, indices) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, priorities, dones, indices = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)

        if self.cfg.prioritized_replay:
            # importance-sampling weights, normalized to a maximum of 1
            weights = 1. / ((self.cfg.batch_size * priorities) ** self.cfg.priority_beta)
            weights /= weights.max()
            # calculate new transition priorities from the TD-error residuals
            # between target and local network predictions
            diffs = Q_targets - Q_expected
            diffs = np.abs(np.squeeze(diffs.tolist()))
            self.memory.update_prios(indices, diffs)
            # apply the bias-annealing weights
            Q_expected *= weights
            Q_targets *= weights

        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.cfg.tau)
        self.soft_update(self.actor_local, self.actor_target, self.cfg.tau)

    @staticmethod
    def hard_copy_weights(local_model, target_model):
        """Hard-copy model parameters from the local to the target network.

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def save_weights(self, model_save_path, suffix=""):
        """Save network weights to disk."""
        # actors
        torch.save(self.actor_local.state_dict(),
                   os.path.join(model_save_path,
                                "weights_actor_local{:s}.pth".format(suffix)))
        torch.save(self.actor_target.state_dict(),
                   os.path.join(model_save_path,
                                "weights_actor_target{:s}.pth".format(suffix)))
        # critics
        torch.save(self.critic_local.state_dict(),
                   os.path.join(model_save_path,
                                "weights_critic_local{:s}.pth".format(suffix)))
        torch.save(self.critic_target.state_dict(),
                   os.path.join(model_save_path,
                                "weights_critic_target{:s}.pth".format(suffix)))

    def load_weights(self, model_save_path, suffix=""):
        """Load network weights from saved files."""
        self.actor_local.load_state_dict(
            torch.load(os.path.join(model_save_path,
                                    "weights_actor_local{:s}.pth".format(suffix))))
        self.actor_target.load_state_dict(
            torch.load(os.path.join(model_save_path,
                                    "weights_actor_target{:s}.pth".format(suffix))))
        self.critic_local.load_state_dict(
            torch.load(os.path.join(model_save_path,
                                    "weights_critic_local{:s}.pth".format(suffix))))
        self.critic_target.load_state_dict(
            torch.load(os.path.join(model_save_path,
                                    "weights_critic_target{:s}.pth".format(suffix))))
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, buffer_size=int(1e5),
                 batch_size=256, learn_every=1, update_every=1, gamma=0.99,
                 tau=0.02, lr_actor=2e-4, lr_critic=2e-3, random_seed=None,
                 use_asn=True, asn_kwargs={}, use_psn=False, psn_kwargs={},
                 use_per=False, restore=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.update_every = update_every
        self.learn_every = learn_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        # Keep track of how many times we've updated weights
        self.i_updates = 0
        self.i_step = 0

        self.use_asn = use_asn
        self.use_psn = use_psn
        self.use_per = use_per

        if random_seed is not None:
            random.seed(random_seed)

        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        if self.use_psn:
            self.actor_perturbed = Actor(state_size, action_size).to(device)

        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)

        # restore networks if needed
        if restore is not None:
            checkpoint = torch.load(restore, map_location=device)
            self.actor_local.load_state_dict(checkpoint[0]['actor'])
            self.actor_target.load_state_dict(checkpoint[0]['actor'])
            if self.use_psn:
                self.actor_perturbed.load_state_dict(checkpoint[0]['actor'])
            self.critic_local.load_state_dict(checkpoint[0]['critic'])
            self.critic_target.load_state_dict(checkpoint[0]['critic'])

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic)

        # Hard copy weights from local to target networks
        policy_update(self.actor_local, self.actor_target, 1.0)
        policy_update(self.critic_local, self.critic_target, 1.0)

        # Noise processes
        if self.use_asn:
            self.action_noise = OUNoise(action_size, **asn_kwargs)
        if self.use_psn:
            self.param_noise = ParameterSpaceNoise(**psn_kwargs)

        # Replay buffer
        if self.use_per:
            self.buffer = PrioritizedExperienceReplay(buffer_size, batch_size, random_seed)
        else:
            self.buffer = ExperienceReplay(buffer_size, batch_size, random_seed)

    def act(self, states, perturb_mode=True, train_mode=True):
        """Returns actions for given state as per current policy."""
        if not train_mode:
            self.actor_local.eval()
            if self.use_psn:
                self.actor_perturbed.eval()

        with torch.no_grad():
            states = torch.from_numpy(states).float().to(device)
            actor = self.actor_perturbed if (self.use_psn and perturb_mode) else self.actor_local
            actions = actor(states).cpu().numpy()[0]

        if train_mode:
            actions += self.action_noise.sample()

        self.actor_local.train()
        if self.use_psn:
            self.actor_perturbed.train()
        return np.clip(actions, -1, 1)

    def perturb_actor_parameters(self):
        """Apply parameter space noise to the actor model, for exploration."""
        policy_update(self.actor_local, self.actor_perturbed, 1.0)
        params = self.actor_perturbed.state_dict()
        for name in params:
            if 'ln' in name:
                # skip layer-norm parameters
                continue
            param = params[name]
            noise = torch.randn(param.shape)
            if use_cuda:
                noise = noise.cuda()
            param += noise * self.param_noise.current_stddev

    def reset(self):
        self.action_noise.reset()
        if self.use_psn:
            self.perturb_actor_parameters()

    def step(self, experience, priority=0.0):
        self.buffer.push(experience)
        self.i_step += 1
        if len(self.buffer) > self.batch_size:
            if self.i_step % self.learn_every == 0:
                self.learn(priority)
            if self.i_step % self.update_every == 0:
                self.update()  # soft update the target networks towards the local networks

    def learn(self, priority=0.0):
        """Update policy and value parameters using a sampled batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            priority (float): sampling priority passed to the prioritized replay buffer
        """
        if self.use_per:
            (states, actions, rewards, states_next, dones), batch_idx = self.buffer.sample(priority)
        else:
            states, actions, rewards, states_next, dones = self.buffer.sample()

        # Get predicted next-state actions and Q values from target models
        with torch.no_grad():
            actions_next = self.actor_target(states_next)
            Q_targets_next = self.critic_target(states_next, actions_next)
            Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))

        # ---------------------------- update critic ---------------------------- #
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_local.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_local.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        if self.use_per:
            # update transition priorities with the new TD-errors
            Q_error = Q_expected - Q_targets
            new_deltas = torch.abs(Q_error.detach().squeeze(1)).cpu().numpy()
            self.buffer.update_deltas(batch_idx, new_deltas)

    def update(self):
        """Soft update the target networks."""
        self.i_updates += 1
        policy_update(self.actor_local, self.actor_target, self.tau)
        policy_update(self.critic_local, self.critic_target, self.tau)

    def save_model(self, model_dir, session_name, i_episode, best):
        filename = os.path.join(
            model_dir, f'ddpg_{session_name}-EP_{i_episode}-score_{best:.3f}.pt')
        filename_best = os.path.join(model_dir, f'ddpg_{session_name}-best.pt')
        save_dict_list = []
        save_dict = {
            'actor': self.actor_local.state_dict(),
            'actor_optim_params': self.actor_optimizer.state_dict(),
            'critic': self.critic_local.state_dict(),
            'critic_optim_params': self.critic_optimizer.state_dict()
        }
        save_dict_list.append(save_dict)
        torch.save(save_dict_list, filename)
        copyfile(filename, filename_best)

    def postprocess(self, t_step):
        """Adapt the parameter-space-noise stddev from the perturbed/unperturbed action distance."""
        if self.use_psn and t_step > 0:
            perturbed_states, perturbed_actions, _, _, _ = self.buffer.tail(t_step)
            unperturbed_actions = self.act(np.array(perturbed_states), False, False)
            diff = np.array(perturbed_actions) - unperturbed_actions
            mean_diff = np.mean(np.square(diff), axis=0)
            dist = sqrt(np.mean(mean_diff))
            self.param_noise.adapt(dist)
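

# The class above relies on a policy_update(local, target, tau) helper that is not
# shown in this section. A minimal sketch consistent with how it is called above
# (tau=1.0 for the initial hard copy, tau=self.tau for soft updates) could look
# like this; treat it as an assumption rather than the original helper.
def policy_update(local_model, target_model, tau):
    """Polyak-average local weights into the target:
    θ_target ← τ*θ_local + (1 - τ)*θ_target. With tau=1.0 this is a hard copy."""
    for target_param, local_param in zip(target_model.parameters(),
                                         local_model.parameters()):
        target_param.data.copy_(tau * local_param.data +
                                (1.0 - tau) * target_param.data)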
class DDPG_single():

    def __init__(self, state_dim, action_dim, max_action, num_agents,
                 learning_rate, discrete_action=True, grid_per_action=20,
                 hidden_dim=32):
        self.max_action = max_action

        self.actor = Actor_DDPG(state_dim, action_dim, max_action, hidden_dim)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=learning_rate)

        self.critic = Critic_DDPG(state_dim, action_dim, num_agents, hidden_dim)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=learning_rate)

        self.exploration = OUNoise(action_dim)
        self.iter = 0

    def scale_noise(self, scale):
        self.exploration.scale = scale

    def reset_noise(self):
        self.exploration.reset()

    def select_action(self, obs, explore=False):
        self.actor.eval()
        action = self.actor(obs)
        self.actor.train()
        if explore:
            device = action.device
            action += torch.Tensor(self.exploration.noise()).to(device)
        action = action.clamp(-self.max_action, self.max_action)
        return action

    def get_params(self):
        return {
            'actor': self.actor.state_dict(),
            'actor_target': self.actor_target.state_dict(),
            'critic': self.critic.state_dict(),
            'critic_target': self.critic_target.state_dict(),
            'actor_optimizer': self.actor_optimizer.state_dict(),
            'critic_optimizer': self.critic_optimizer.state_dict()
        }

    def load_params(self, params):
        self.actor.load_state_dict(params['actor'])
        self.actor_target.load_state_dict(params['actor_target'])
        self.actor_optimizer.load_state_dict(params['actor_optimizer'])
        self.critic.load_state_dict(params['critic'])
        self.critic_target.load_state_dict(params['critic_target'])
        self.critic_optimizer.load_state_dict(params['critic_optimizer'])
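

# DDPG_single expects an Ornstein-Uhlenbeck process exposing noise(), reset() and a
# mutable `scale` attribute, which is not defined in this section. Below is a minimal
# sketch of such a process; the class name and parameter defaults are assumptions.
import numpy as np


class ScaledOUNoise:
    """Minimal OU process with the noise()/reset()/scale interface DDPG_single uses."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, scale=1.0):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.scale = scale          # adjusted externally via DDPG_single.scale_noise()
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); scale applied to the returned sample
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state * self.scale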
class Agent():

    def __init__(self, state_size, action_size, replay_memory, random_seed=0,
                 nb_agent=20, bs=128, gamma=0.99, tau=1e-3, lr_actor=1e-4,
                 lr_critic=1e-4, wd_actor=0, wd_critic=0, clip_actor=None,
                 clip_critic=None, update_interval=20, update_times=10):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.nb_agent = nb_agent
        self.bs = bs
        self.update_interval = update_interval
        self.update_times = update_times
        self.timestep = 0
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.wd_critic = wd_critic
        self.wd_actor = wd_actor
        self.clip_critic = clip_critic
        self.clip_actor = clip_actor
        self.actor_losses = []
        self.critic_losses = []

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor,
                                          weight_decay=self.wd_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.lr_critic,
                                           weight_decay=self.wd_critic)

        # Noise process
        self.noise = OUNoise((self.nb_agent, action_size), random_seed)

        # Replay memory
        self.memory = replay_memory

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # increment timestep
        self.timestep += 1
        # Save experience / reward
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if self.timestep % self.update_interval == 0:
            for i in range(self.update_times):
                if len(self.memory) > self.bs:
                    experiences = self.memory.sample(self.bs)
                    self.learn(experiences)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset_noise(self):
        self.noise.reset()

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        if self.clip_critic:
            torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.clip_critic)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        if self.clip_actor:
            torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), self.clip_actor)
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

        self.actor_losses.append(actor_loss.cpu().data.numpy())
        self.critic_losses.append(critic_loss.cpu().data.numpy())

    def soft_update(self, local_model, target_model):
        """Soft update model parameters using self.tau.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
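

# The Agent above takes a pre-built replay_memory with add(...), sample(batch_size)
# and len() support. A minimal uniform-sampling buffer matching that interface is
# sketched below; the class/field names and the device handling are assumptions,
# not the original implementation.
import random as _random
from collections import deque, namedtuple

import numpy as np
import torch

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class UniformReplayMemory:
    """Minimal uniform replay buffer with the interface the Agent above expects."""

    def __init__(self, buffer_size=int(1e6), device=torch.device("cpu")):
        self.buffer = deque(maxlen=buffer_size)
        self.device = device

    def add(self, state, action, reward, next_state, done):
        self.buffer.append(Experience(state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = _random.sample(self.buffer, batch_size)
        to_tensor = lambda x: torch.from_numpy(
            np.asarray(x, dtype=np.float32)).to(self.device)
        states = to_tensor([e.state for e in batch])
        actions = to_tensor([e.action for e in batch])
        rewards = to_tensor([e.reward for e in batch]).unsqueeze(1)
        next_states = to_tensor([e.next_state for e in batch])
        dones = to_tensor([float(e.done) for e in batch]).unsqueeze(1)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)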
# value_criterion = nn.MSELoss()

replay_buffer_size = 1000000
replay_buffer = ReplayBuffer(replay_buffer_size)

max_frames = 12000 * NUM_PROCESSES
max_steps = 500
frame_idx = 0
episode_rewards = []
batch_size = 128

if __name__ == "__main__":
    # start from the initial state and collect experience until the frame budget is used up
    while frame_idx < max_frames:
        state = envs.reset()
        ou_noise.reset()
        episode_reward = 0

        for step in range(max_steps):
            action = policy_net.get_action(state)
            action = ou_noise.get_action(action, step)
            next_state, reward, done, _ = envs.step(action)

            replay_buffer.push(state, action, reward, next_state, done)
            if len(replay_buffer) > batch_size:
                ddpg.update(batch_size, replay_buffer)

            state = next_state
            episode_reward += reward
            frame_idx += NUM_PROCESSES
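

# The training loop above references envs, policy_net, ou_noise and ddpg, which are
# defined elsewhere. In particular it calls ou_noise.get_action(action, step), an
# interface not shown in this section. One common pattern consistent with that call
# is OU noise whose sigma decays with the step index; the sketch below is an
# assumption (class name, defaults and decay schedule are illustrative only).
import numpy as np


class TimeDecayedOUNoise:
    """OU exploration noise with a get_action(action, t) wrapper that anneals sigma."""

    def __init__(self, action_dim, low=-1.0, high=1.0, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.05, decay_period=100000):
        self.action_dim = action_dim
        self.low, self.high = low, high
        self.mu, self.theta = mu, theta
        self.max_sigma, self.min_sigma = max_sigma, min_sigma
        self.decay_period = decay_period
        self.sigma = max_sigma
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        dx = self.theta * (self.mu - self.state) + \
             self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, t=0):
        # anneal sigma linearly from max_sigma to min_sigma over decay_period steps
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) * \
                     min(1.0, t / self.decay_period)
        return np.clip(action + self.evolve_state(), self.low, self.high)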