def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0.0        # 0
    self.exploration_theta = 0.125   # 0.14 | 0.1
    self.exploration_sigma = 0.0009  # 0.001 | 0.2 | 0.001
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.998  # 0.99 | 0.9 | discount factor
    self.tau = 0.099    # 0.001 | 0.01 | 0.1 | 0.05 | for soft update of target parameters

    # Score tracker
    self.best_score = -np.inf
    self.score = 0
def __init__(self, task, gym=False):
    self.task = task
    if gym:
        self.state_size = np.prod(task.observation_space.shape)
        self.action_size = np.prod(task.action_space.shape)
        self.action_low = task.action_space.low
        self.action_high = task.action_space.high
    else:
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
    self.alpha = task.alpha
    self.beta = task.beta

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.alpha)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high, self.alpha)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size, self.beta)
    self.critic_target = Critic(self.state_size, self.action_size, self.beta)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0        # changed from 0
    self.exploration_theta = 0.15  # changed from 0.15
    self.exploration_sigma = 3.2   # changed from 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 1000  # was 100000 originally
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor, 0.99
    self.tau = .001    # for soft update of target parameters, 0.01 originally
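# The OUNoise class used by the agents above and below is referenced but not defined in
# these snippets. Below is a minimal sketch of a common Ornstein-Uhlenbeck implementation
# matching the OUNoise(size, mu, theta, sigma) signature used here; the PyTorch agent's
# OUNoise(size=..., seed=...) variant is assumed to behave the same way. Illustrative only,
# not taken from the original source.
import copy
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)  # long-running mean
        self.theta = theta            # speed of mean reversion
        self.sigma = sigma            # volatility of the random component
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state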
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.9  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        self.total_reward = 0
        self.count = 0
        self.best_score = -np.inf
        self.score = 0

    def reset_episode(self):
        self.total_reward = 0
        self.count = 0
        self.score = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.total_reward += reward
        self.count += 1

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(states)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
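# Usage sketch (illustrative, not from the original source): how the DDPG agent above might
# be driven during training. It assumes a `task` object exposing reset() and
# step(action) -> (next_state, reward, done), and a hypothetical episode budget.
agent = DDPG(task)
num_episodes = 1000  # hypothetical budget
for episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)                      # policy action plus exploration noise
        next_state, reward, done = task.step(action)   # assumed task interface
        agent.step(action, reward, next_state, done)   # store experience and learn when possible
        state = next_state
    print("Episode {:4d}  score: {:7.3f}  best: {:7.3f}".format(episode, agent.score, agent.best_score))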
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, single_rotor_control=False,
                 buffer_size=int(1e5), batch_size=128, gamma=0.98, tau=1e-3,
                 lr_actor=1e-4, lr_critic=1e-3, random_seed=0):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size

        # rotor control mode
        self.single_rotor_control = single_rotor_control
        if self.single_rotor_control:
            action_size = 1

        # define hyperparameters
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(size=action_size, seed=random_seed)

        # Replay memory
        self.memory = ReplayBuffer(buffer_size, random_seed)

        # counter for time steps
        self.time_step = 0

        # self.soft_update(self.critic_local, self.critic_target, 1.0)
        # self.soft_update(self.actor_local, self.actor_target, 1.0)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        if self.single_rotor_control:
            self.memory.add(state, action[0], reward, next_state, done)
        else:
            self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample(self.batch_size)
            self.learn(experiences, self.gamma)

        # increase time step count
        self.time_step += 1

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states.unsqueeze(0)).cpu().data.numpy()[0]
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        if self.single_rotor_control:
            actions = np.repeat(actions, self.action_size)
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, idxs, is_weights = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute Q value predictions
        Q_expected = self.critic_local(states, actions)
        # compute td error
        td_error = Q_targets - Q_expected
        # update td error in Replay buffer
        self.memory.update_priorities(idxs, td_error.detach().cpu().numpy().squeeze())
        # compute critic loss
        critic_loss = ((is_weights * td_error) ** 2).mean()
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -(is_weights * self.critic_local(states, actions_pred).squeeze()).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
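# Usage sketch (illustrative, not from the original source): driving the PyTorch Agent above
# with a hypothetical environment `env` exposing reset() -> state and
# step(action) -> (next_state, reward, done). The episode budget and the state/action sizes
# are assumptions made only for this example.
agent = Agent(state_size=12, action_size=4, single_rotor_control=True)
for episode in range(1, 501):            # hypothetical episode budget
    state = env.reset()                  # assumed env interface
    agent.reset()                        # reset the exploration noise process
    done = False
    while not done:
        action = agent.act(state, add_noise=True)
        next_state, reward, done = env.step(action)          # assumed env interface
        agent.step(state, action, reward, next_state, done)  # store experience and learn
        state = next_state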
class Agent():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor Policy Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic Value Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0
        self.exploration_theta = 0.15  # 0.15
        self.exploration_sigma = 0.25  # 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay Memory
        self.buffer_size = 100000
        self.batch_size = 16
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99
        self.tau = 0.01

    def reset_episode(self):
        # reset of the episode
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # last state and action
        self.last_state = next_state

    def act(self, state):
        # Returns actions for given state(s) as per current policy
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        # Update policy and value parameters using given batch of experience tuples
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        # Training function
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        # Local model parameters and target model parameters should have the same size
        assert len(local_weights) == len(target_weights)

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
class Agent(object):
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Parameters
        self.MU = 0
        self.THETA = 0.15
        self.SIGMA = 0.10
        self.GAMMA = 0.99
        self.TAU = 0.001
        self.BATCHS = 256
        self.MAX_REWARD = -999999999

        # Actor Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        # init
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Critic Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        # init
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

        # Noise process
        self.noiseObj = Noise(self.action_size, self.MU, self.THETA, self.SIGMA)

        # Replay memory
        self.replayObj = Replay(self.BATCHS)

    def reset_episode(self):
        self.count = 0
        self.total_reward = 0
        self.noiseObj.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        self.replayObj.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1
        if self.total_reward > self.MAX_REWARD:
            self.MAX_REWARD = self.total_reward
        if len(self.replayObj) > self.BATCHS:
            experiences = self.replayObj.sample()
            self.learn(experiences)
        self.last_state = next_state

    def act(self, states):
        action = self.actor_local.model.predict(np.reshape(states, [-1, self.state_size]))[0]
        return list(action + self.noiseObj.sample())

    def learn(self, experiences):
        states = np.array([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).reshape(-1, 1)
        next_state = np.array([e.next_state for e in experiences if e is not None])

        # Get the predicted next_state actions and the Q values from the target models
        next_actions = self.actor_target.model.predict_on_batch(next_state)
        next_Q_targets = self.critic_target.model.predict_on_batch([next_state, next_actions])
        Q_targets = rewards + self.GAMMA * next_Q_targets * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the actor model
        action_gradients = np.reshape(self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        self.update(self.critic_local.model, self.critic_target.model)
        self.update(self.actor_local.model, self.actor_target.model)

    def update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        new_weights = self.TAU * local_weights + (1 - self.TAU) * target_weights
        target_model.set_weights(new_weights)
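# The replay memory used by these agents (ReplayBuffer / Replay) is referenced but not
# included in the snippets. Below is a minimal sketch matching the
# ReplayBuffer(buffer_size, batch_size) signature and the .add / .sample / len() usage above;
# the Replay(BATCHS) variant and the prioritized buffer used by the PyTorch agent are assumed
# to expose a similar interface. Illustrative only, not taken from the original source.
import random
from collections import deque, namedtuple

class ReplayBuffer:
    """Fixed-size buffer to store experience tuples."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # drop the oldest experiences when full
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Randomly sample a batch of experiences from memory."""
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        """Return the current size of the internal memory."""
        return len(self.memory)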
class MADDPG():
    """
    Class definition of MADDPG agent. Interacts with and learns from the environment.
    Comprises a pair of Actor-Critic networks and implements centralized training
    and decentralized execution (learn function).
    """

    def __init__(self, state_size, action_size, num_agents, device, seed=23520,
                 GRADIENT_CLIP=1, ACTIVATION=F.relu, BOOTSTRAP_SIZE=5, GAMMA=0.99,
                 TAU=1e-3, LR_CRITIC=5e-4, LR_ACTOR=5e-4, UPDATE_EVERY=1,
                 TRANSFER_EVERY=2, UPDATE_LOOP=10, ADD_NOISE_EVERY=5,
                 WEIGHT_DECAY=0, MEMORY_SIZE=5e4, BATCH_SIZE=64):
        """Initialize an Agent object.

        Params
        ======
            state_size : dimension of each state
            action_size : dimension of each action
            num_agents : number of running agents
            device : cpu or cuda:0 if available

            ----- These are hyperparameters -----
            BOOTSTRAP_SIZE : How far ahead to bootstrap
            GAMMA : Discount factor
            TAU : Parameter for performing soft updates of target parameters
            LR_CRITIC, LR_ACTOR : Learning rate of the networks
            UPDATE_EVERY : How often to update the networks
            TRANSFER_EVERY : How often to transfer the weights from local to target
            UPDATE_LOOP : Number of iterations for network update
            ADD_NOISE_EVERY : How often to add noise to favor exploration
            WEIGHT_DECAY : L2 weight decay for critic optimizer
            GRADIENT_CLIP : Limit of gradient to be clipped, to avoid exploding gradient issue
        """
        # Actor networks
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR, weight_decay=WEIGHT_DECAY)
        hard_update(self.actor_local, self.actor_target)

        # Critic networks
        self.critic_local = Critic(state_size * 2, action_size).to(device)
        self.critic_target = Critic(state_size * 2, action_size).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
        hard_update(self.critic_local, self.critic_target)

        self.device = device
        self.num_agents = num_agents

        # Noise: using simple noise instead of OUNoise
        self.noise = [SimpleNoise(action_size, scale=1) for i in range(num_agents)]

        # Replay memory
        self.memory = ReplayBuffer(action_size, device, int(MEMORY_SIZE), BATCH_SIZE, seed)

        # Initialize time steps (for updating every UPDATE_EVERY steps)
        self.u_step = 0
        self.n_step = 0

        # Keep hyperparameters within the instance
        self.BOOTSTRAP_SIZE = BOOTSTRAP_SIZE
        self.GAMMA = GAMMA
        self.TAU = TAU
        self.LR_CRITIC = LR_CRITIC
        self.LR_ACTOR = LR_ACTOR
        self.UPDATE_EVERY = UPDATE_EVERY
        self.TRANSFER_EVERY = TRANSFER_EVERY
        self.UPDATE_LOOP = UPDATE_LOOP
        self.ADD_NOISE_EVERY = ADD_NOISE_EVERY
        self.GRADIENT_CLIP = GRADIENT_CLIP

        # Store the information of the n previous timesteps needed to apply the bootstrap
        self.rewards = deque(maxlen=BOOTSTRAP_SIZE)
        self.states = deque(maxlen=BOOTSTRAP_SIZE)
        self.actions = deque(maxlen=BOOTSTRAP_SIZE)
        self.gammas = np.array([[GAMMA**i for j in range(num_agents)] for i in range(BOOTSTRAP_SIZE)])

        self.loss_function = torch.nn.SmoothL1Loss()

    def reset(self):
        if self.noise:
            for n in self.noise:
                n.reset()

    def set_noise(self, noise):
        self.noise = noise

    def save(self, filename):
        torch.save(self.actor_local.state_dict(), "{}_actor_local.pth".format(filename))
        torch.save(self.actor_target.state_dict(), "{}_actor_target.pth".format(filename))
        torch.save(self.critic_local.state_dict(), "{}_critic_local.pth".format(filename))
        torch.save(self.critic_target.state_dict(), "{}_critic_target.pth".format(filename))

    def load(self, path):
        self.actor_local.load_state_dict(torch.load(path + "_actor_local.pth"))
        self.actor_target.load_state_dict(torch.load(path + "_actor_target.pth"))
        self.critic_local.load_state_dict(torch.load(path + "_critic_local.pth"))
        self.critic_target.load_state_dict(torch.load(path + "_critic_target.pth"))

    def act(self, states, noise=0.0):
        """Returns actions of each actor for given states.

        Params
        ======
            states : current states
            noise : scale of the exploration noise to add. During training this is
                necessary to promote exploration, but it should be 0.0 during validation
        """
        actions = None
        self.n_step = (self.n_step + 1) % self.ADD_NOISE_EVERY
        with torch.no_grad():
            self.actor_local.eval()
            states = torch.from_numpy(states).float().unsqueeze(0).to(self.device)
            actions = self.actor_local(states).squeeze().cpu().data.numpy()
            self.actor_local.train()
            if self.n_step == 0:
                for i in range(len(actions)):
                    actions[i] += noise * self.noise[i].sample()
        return actions

    def step(self, states, actions, rewards, next_states, dones):
        """Take a step for the current episode:
        1. Save the experience
        2. Bootstrap the rewards
        3. If the update conditions are satisfied, perform learning for the required number of loops
        """
        # Save experience in replay memory
        self.rewards.append(rewards)
        self.states.append(states)
        self.actions.append(actions)

        if len(self.rewards) == self.BOOTSTRAP_SIZE:
            # get the bootstrapped sum of rewards per agent
            reward = np.sum(self.rewards * self.gammas, axis=-2)
            self.memory.add(self.states[0], self.actions[0], reward, next_states, dones)

        if np.any(dones):
            self.rewards.clear()
            self.states.clear()
            self.actions.clear()

        # Learn every UPDATE_EVERY timesteps
        self.u_step = (self.u_step + 1) % self.UPDATE_EVERY

        t_step = 0
        if len(self.memory) > self.memory.batch_size and self.u_step == 0:
            for _ in range(self.UPDATE_LOOP):
                self.learn()
                # transfer the weights as specified
                t_step = (t_step + 1) % self.TRANSFER_EVERY
                if t_step == 0:
                    soft_update(self.actor_local, self.actor_target, self.TAU)
                    soft_update(self.critic_local, self.critic_target, self.TAU)

    def transform_states(self, states):
        """Transform states to full states so that both agents can see each other's state
        via the critic network."""
        batch_size = states.shape[0]
        state_size = states.shape[-1]
        num_agents = states.shape[-2]
        transformed_states = torch.zeros((batch_size, num_agents, state_size * num_agents)).to(self.device)
        for i in range(num_agents):
            start = 0
            for j in range(num_agents):
                transformed_states[:, i, start:start + state_size] += states[:, j]
                start += state_size
        return transformed_states

    def learn(self):
        """Update the network parameters using the experiences.
        The algorithm is described in detail in the readme.

        Params
        ======
            experiences : List of (s, a, r, s', done) tuples
        """
        # sample the memory to disrupt the internal correlation
        states, actions, rewards, next_states, dones = self.memory.sample()
        full_states = self.transform_states(states)

        # The critic should estimate the value of the states to be equal to rewards plus
        # the estimation of the next_states value according to the critic_target and actor_target
        with torch.no_grad():
            self.actor_target.eval()
            self.critic_target.eval()
            # obtain next actions as given by the target network and get transformed states for the critic
            next_actions = self.actor_target(next_states)
            next_full_states = self.transform_states(next_states)
            # calculate the Q value using the transformed next states and next actions,
            # i.e. predict the next value from the target's perspective
            q_next = self.critic_target(next_full_states, next_actions).squeeze(-1)
            # calculate the target's value
            targeted_value = rewards + (self.GAMMA**self.BOOTSTRAP_SIZE) * q_next * (1 - dones)

        current_value = self.critic_local(full_states, actions).squeeze(-1)
        loss = self.loss_function(current_value, targeted_value)

        # During the optimization, the critic tells how much the value is off from the action
        # value and adjusts the network towards it. Basically, the critic takes the actions
        # predicted by the actor and tells how good or bad they are by calculating their Q-values.
        # calculate the loss of the critic network and backpropagate
        self.critic_optim.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), self.GRADIENT_CLIP)
        self.critic_optim.step()

        # optimize the actor by having the critic evaluate the value of the actor's decision
        self.actor_optim.zero_grad()
        actions_pred = self.actor_local(states)
        mean = self.critic_local(full_states, actions_pred).mean()
        (-mean).backward()
        self.actor_optim.step()
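# hard_update and soft_update are called by the MADDPG class above but defined elsewhere in
# its repository. A minimal sketch of the usual definitions is given below (assumed, not taken
# from the original source): hard_update copies parameters verbatim, soft_update blends them
# using the interpolation factor tau, matching the (local, target[, tau]) call order used above.
def hard_update(source_model, target_model):
    """Copy parameters from source_model into target_model."""
    for target_param, source_param in zip(target_model.parameters(), source_model.parameters()):
        target_param.data.copy_(source_param.data)

def soft_update(source_model, target_model, tau):
    """θ_target = τ*θ_source + (1 - τ)*θ_target"""
    for target_param, source_param in zip(target_model.parameters(), source_model.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)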