class AlgoA2C(AlgoBase):
    def __init__(self, num_state, num_action, configDict, train=True):
        super(AlgoA2C, self).__init__(num_state, num_action, configDict, createResults=False)

        # parameters of the internal DRL algorithm:
        ## Memory:
        self.MEMORY_CAPACITY = 100000
        self.GAMMA = 0.95
        ## Deep network:
        self.MEMORY_BATCH_SIZE = 64  # number of samples per training step (could also be set equal to MEMORY_CAPACITY)

        self.train = train
        if train:
            ## RL algorithm:
            ## random-exploration proportion (epsilon-greedy):
            self.MAX_EPSILON = 1.0
            self.MIN_EPSILON = 0.01
            self.LAMBDA = 0.005  # speed of decay
            self.epsilon = self.MAX_EPSILON
        else:
            self.epsilon = 0.0

        self.brain = Brain(num_state, num_action, configDict, RL_GAMMA=self.GAMMA)
        self.memory = ExperienceReplay(self.MEMORY_CAPACITY)

        self.next_model(configDict)

    def next_model(self, configDict, load=False):
        super(AlgoA2C, self).next_model(configDict, load)
        self.brain.set_model(configDict)

    def load(self):
        loaded = self.brain.load()
        self.resultFile.Load()
        if loaded:
            self.episodes = self.resultFile.NumRuns()

    def act(self, state):
        # action in [0, 1, ..., num_action - 1]
        if random.random() < self.epsilon:
            action = random.randint(0, self.num_action - 1)
        else:
            # greedy action: index of the largest predicted value
            action = np.argmax(self.brain.predictOne(state_test=state))
        return action

    def observe(self, s, a, r, s_, done):
        # store the transition; a terminal next state is stored as None so learn() can skip the bootstrap
        self.memory.add((s, a, r, None if done else s_))

    def end_episode(self, r, sumR, steps, realR):
        # decrease epsilon to reduce random actions and trust the greedy policy more
        self.epsilon = self.MIN_EPSILON + (self.MAX_EPSILON - self.MIN_EPSILON) * math.exp(-self.LAMBDA * self.episodes)
        self.episodes += 1

        saveModel = self.resultFile.end_run(r, sumR, steps, realR)
        if saveModel:
            self.brain.save_latest()
        return saveModel, ""

    def replay(self):
        pass

    def learn(self):
        batch = self.memory.sample(self.memory.num_experience())
        size = len(batch)
        no_state = np.zeros(self.num_state)

        states = np.array([o[0] for o in batch])
        next_states = np.array([(no_state if o[3] is None else o[3]) for o in batch])

        v = self.brain.predict(states)
        v_ = self.brain.predict(next_states)

        # inputs and targets of the deep network:
        x = np.zeros((size, self.num_state))
        y = np.zeros((size, self.num_action))

        for i in range(size):
            s, a, r, s_ = batch[i][0], int(batch[i][1]), batch[i][2], batch[i][3]
            v_t = v[i]
            if s_ is None:
                v_t[a] = r  # terminal transition: no bootstrap
            else:
                v_t[a] = r + self.GAMMA * np.amax(v_[i])  # bootstrap with the best next-state value
            x[i] = s
            y[i] = v_t

        self.brain.train(x, y, batch_size=size)

    def Results(self, size):
        return self.resultFile.Results(size)
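# A minimal usage sketch for AlgoA2C (an assumption, not part of the original file): it assumes a
# gym-style environment with env.reset() -> state and env.step(action) -> (next_state, reward, done, info),
# plus a configDict accepted by Brain. The function name run_episode_sketch and the env object are placeholders.
def run_episode_sketch(env, algo):
    s = env.reset()
    sumR, steps, done, r = 0.0, 0, False, 0.0
    while not done:
        a = algo.act(s)                      # epsilon-greedy action
        s_, r, done, _ = env.step(a)         # advance the environment
        algo.observe(s, a, r, s_, done)      # store the transition
        s, sumR, steps = s_, sumR + r, steps + 1
    algo.learn()                             # fit the network on the stored transitions
    algo.end_episode(r, sumR, steps, sumR)   # decay epsilon and log the run (realR passed as sumR here)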
class Agent():
    def __init__(self, state_size, action_size, num_agents, seed,
                 gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2,
                 buffer_size=10e5, buffer_type='replay', policy_update=1):
        # General info
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(seed)
        self.t_step = 0
        self.gamma = gamma

        # Actor Network -- Policy-based
        self.actor = DDPG_Actor(state_size, action_size, hidden_dims=(128, 128), seed=seed)
        self.target_actor = DDPG_Actor(state_size, action_size, hidden_dims=(128, 128), seed=seed)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)

        # Critic Network -- Value-based
        self.critic = DDPG_Critic(state_size, action_size, hidden_dims=(128, 128), seed=seed)
        self.target_critic = DDPG_Critic(state_size, action_size, hidden_dims=(128, 128), seed=seed)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)

        self.tau = tau

        # Replay memory
        self.buffer_type = buffer_type
        self.memory = ExperienceReplay(action_size, int(buffer_size))  # ExperienceReplay
        self.per = PrioritizedExperienceReplay(capacity=int(buffer_size), alpha=0.6, beta=0.9, error_offset=0.001)

        # NormalNoiseStrategy
        self.normal_noise = NormalNoiseStrategy()

        # Delayed updates from TD3
        self.policy_update = policy_update

    def select_action(self, state):
        return self.normal_noise.select_action(self.actor, state)

    def select_action_evaluation(self, state):
        return self.actor(state).cpu().detach().data.numpy().squeeze()

    def _critic_error(self, state, action, reward, next_state, done):
        done = int(done)
        reward = float(reward)
        with torch.no_grad():
            argmax_a = self.target_actor(next_state)
            q_target_next = self.target_critic(next_state, argmax_a)
            q_target = reward + (self.gamma * q_target_next * (1 - done))
            q_expected = self.critic(state, action)
            td_error = q_expected - q_target.detach()
        return td_error.detach().numpy()

    def step(self, state, action, reward, next_state, done, batch_size=64):
        self.t_step += 1

        if self.buffer_type == 'prioritized':
            if self.num_agents == 20:
                reward = np.asarray(reward)[:, np.newaxis]
                done = np.asarray(done)[:, np.newaxis]
                for i in range(self.num_agents):
                    error = self._critic_error(state[i], action[i], reward[i], next_state[i], done[i])
                    self.per.add(error, (state[i], action[i], reward[i], next_state[i], done[i]))
            else:
                done = np.asarray(done)
                reward = np.asarray(reward)
                state = state.squeeze()
                next_state = next_state.squeeze()
                error = self._critic_error(state, action, reward, next_state, done)
                self.per.add(error, (state, action, reward, next_state, done))

            # train if enough samples
            if self.t_step > batch_size:
                experiences, mini_batch, idxs, is_weights = self.per.sample(batch_size)
                self.learn(experiences, batch_size, idxs, is_weights)

        # add to replay buffer
        else:
            if self.num_agents == 20:
                reward = np.asarray(reward)[:, np.newaxis]
                done = np.asarray(done)[:, np.newaxis]
                for i in range(self.num_agents):
                    self.memory.add(state[i], action[i], reward[i], next_state[i], done[i])
            else:
                self.memory.add(state, action, reward, next_state, done)

            # train if enough samples
            if len(self.memory) > batch_size:
                experiences = self.memory.sample(batch_size)
                self.learn(experiences, batch_size)

    def learn(self, experiences, batch_size, idxs=0, is_weights=0):
        states, actions, rewards, next_states, dones = experiences

        # *** 1. UPDATE Online Critic Network ***
        # 1.1. Calculate targets for the critic
        argmax_a = self.target_actor(next_states)
        q_target_next = self.target_critic(next_states, argmax_a)
        q_target = rewards + (self.gamma * q_target_next * (1 - dones))
        q_expected = self.critic(states, actions)

        # 1.2. Compute loss
        td_error = q_expected - q_target.detach()
        if self.buffer_type == 'prioritized':
            # PER --> update priorities
            with torch.no_grad():
                error = td_error.detach().numpy()
                for i in range(batch_size):
                    idx = idxs[i]
                    self.per.update(idx, error[i])
            value_loss = (torch.FloatTensor(is_weights) * td_error.pow(2).mul(0.5)).mean()
        else:
            value_loss = td_error.pow(2).mul(0.5).mean()
            # value_loss = F.mse_loss(q_expected, q_target)

        # 1.3. Update Critic
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        if self.t_step % self.policy_update == 0:
            """
            Delaying target networks and policy updates, from:
            *** Addressing Function Approximation Error in Actor-Critic Methods ***
            """
            # *** 2. UPDATE Online Actor Network ***
            argmax_a = self.actor(states)
            max_val = self.critic(states, argmax_a)
            policy_loss = -max_val.mean()  # add minus because it's gradient ascent

            # Update Actor
            self.actor_optimizer.zero_grad()
            policy_loss.backward()
            # torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
            self.actor_optimizer.step()

            # 3. UPDATE TARGET networks
            self.soft_update(self.actor, self.target_actor, self.tau)
            self.soft_update(self.critic, self.target_critic, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
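# The DDPG agent above delegates exploration to NormalNoiseStrategy, which is defined elsewhere
# in this repo. The class below is only an illustrative sketch of that idea (zero-mean Gaussian
# noise added to the deterministic policy output, clipped to an assumed action range of [-1, 1]);
# its constructor arguments and noise scale are assumptions and need not match the real class.
class NormalNoiseStrategySketch:
    def __init__(self, noise_scale=0.1, low=-1.0, high=1.0):
        self.noise_scale = noise_scale
        self.low = low
        self.high = high

    def select_action(self, actor, state):
        # deterministic (greedy) action from the policy network
        with torch.no_grad():
            greedy_action = actor(state).cpu().detach().data.numpy().squeeze()
        # exploration: perturb with Gaussian noise, then clip to the valid range
        noise = np.random.normal(0.0, self.noise_scale, size=greedy_action.shape)
        return np.clip(greedy_action + noise, self.low, self.high)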
class DQNAgent:
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def __init__(self, osize, asize, seed, buffersize=int(1e6), gamma=0.99,
                 epsilon=0.05, epsilondecay=1e6, epsilonmin=0.1,
                 minibatchsize=128, lr=0.01, tau=0.01):
        """
        Initialize DQN agent parameters.
        """
        # initialize agent parameters
        self.osize = osize
        self.asize = asize
        self.gamma = gamma
        self.epsilon0 = epsilon
        self.epsilon = epsilon
        self.epsilondecay = epsilondecay
        self.epsilonmin = epsilonmin
        self.minibatchsize = minibatchsize
        self.lr = lr
        self.tau = tau
        self.stepcount = 0
        self.loss_log = []

        # set the random seed
        self.seed = torch.manual_seed(seed)

        # create local and target Q networks
        self.Q = QNetwork(osize, asize).to(self.device)
        self.targetQ = QNetwork(osize, asize).to(self.device)

        # initialize optimizer
        self.optimizer = optim.Adam(self.Q.parameters(), lr=self.lr)

        # initialize experience replay
        self.replay = ExperienceReplay(asize, buffersize, minibatchsize, seed)

    def step(self, state, action, reward, next_state, done):
        """
        Step the agent, and learn if necessary.
        """
        # add experience to replay
        self.replay.add(state, action, reward, next_state, done)

        # learn from experiences
        if len(self.replay) > self.minibatchsize:
            # create mini batch for learning
            experiences = self.replay.sample(self.device)
            # train the agent
            self.learn(experiences)

        # increase step count
        self.stepcount += 1

        # decay epsilon: anneal from the initial value toward epsilonmin,
        # treating epsilondecay as a decay time constant in steps
        decayed_epsilon = self.epsilon0 * np.exp(-self.stepcount / self.epsilondecay)
        self.epsilon = max(self.epsilonmin, decayed_epsilon)

    def get_action(self, state):
        """
        Get an epsilon greedy action.
        """
        # convert network input to torch variable
        x = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # obtain network output
        self.Q.eval()
        with torch.no_grad():  # do not calculate network gradients, which speeds things up
            y = self.Q(x)
        self.Q.train()

        # select action
        if random.random() > self.epsilon:
            # epsilon greedy action (action is actually an action index)
            action = np.argmax(y.cpu().data.numpy())
        else:
            # random action selection
            action = np.random.choice(np.arange(self.asize))

        return action

    def learn(self, experiences):
        """
        Learn using the Double DQN algorithm.
        """
        # unpack experience
        states, actions, rewards, next_states, dones = experiences

        # get the argmax of Q(next_state) under the online network
        a_max = torch.argmax(self.Q(next_states), dim=1, keepdim=True)

        # evaluate those actions with the target Q network
        target_q = self.targetQ(next_states).detach().gather(1, a_max)

        # calculate target and local Qs
        target = rewards + self.gamma * target_q * (1 - dones)
        local = self.Q(states).gather(1, actions)

        # calculate loss
        loss = F.mse_loss(local, target)
        self.loss_log.append(loss.cpu().data.numpy())

        # perform gradient descent step
        self.optimizer.zero_grad()  # reset the gradients to zero
        loss.backward()
        self.optimizer.step()

        # soft update target network
        for target_params, params in zip(self.targetQ.parameters(), self.Q.parameters()):
            target_params.data.copy_(self.tau * params.data + (1 - self.tau) * target_params.data)
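# A sketch of the ExperienceReplay buffer the DQNAgent above expects (the real implementation
# lives elsewhere in this repo): uniform random sampling of fixed-size minibatches returned as
# torch tensors on the requested device. The constructor signature mirrors how DQNAgent calls
# it; everything inside the class is an assumption for illustration.
from collections import deque, namedtuple

class ExperienceReplaySketch:
    def __init__(self, asize, buffersize, minibatchsize, seed):
        self.memory = deque(maxlen=buffersize)
        self.minibatchsize = minibatchsize
        self.experience = namedtuple('Experience', ['s', 'a', 'r', 's_', 'd'])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        # store one transition
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, device):
        # draw a uniform minibatch and stack it into torch tensors
        batch = random.sample(self.memory, k=self.minibatchsize)
        states = torch.from_numpy(np.vstack([e.s for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.a for e in batch])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.r for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.s_ for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.d for e in batch]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)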
class MADDPG_Agent():
    def __init__(self, state_size, action_size, num_agents,
                 gamma=0.99, tau=1e-3, lr_actor=1e-3, lr_critic=1e-2,
                 buffer_size=1e5, buffer_type='replay', policy_update=1,
                 noise_init=1.0, noise_decay=0.9995, min_noise=0.1):
        # General info
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.t_step = 0
        self.gamma = gamma

        # Actor Networks -- Policy-based (one per agent)
        self.actors = [DDPG_Actor(state_size, action_size, hidden_dims=(128, 128))
                       for i in range(num_agents)]
        self.actor_optimizers = [optim.Adam(actor.parameters(), lr=lr_actor)
                                 for actor in self.actors]
        # targets
        self.target_actors = [DDPG_Actor(state_size, action_size, hidden_dims=(128, 128))
                              for i in range(num_agents)]
        [self.hard_update(self.actors[i], self.target_actors[i]) for i in range(num_agents)]

        # Critic Network -- Value-based --> in this approach we use one common (centralized) critic for all the actors
        self.critic = DDPG_Critic(state_size, action_size, hidden_dims=(128, 128))
        self.target_critic = DDPG_Critic(state_size, action_size, hidden_dims=(128, 128))
        self.hard_update(self.critic, self.target_critic)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic)

        # How to update networks
        self.tau = tau
        self.policy_update = policy_update

        # Replay memory
        self.buffer_type = buffer_type
        self.memory = ExperienceReplay(action_size, int(buffer_size))  # ExperienceReplay
        self.per = PrioritizedExperienceReplay(capacity=int(buffer_size), alpha=0.6, beta=0.9, error_offset=0.001)

        # NormalNoiseStrategy
        self.normal_noise = NormalNoiseStrategy(noise_init=noise_init,
                                                noise_decay=noise_decay,
                                                min_noise_ratio=min_noise)

    def select_action(self, state):
        actions = []
        for i in range(self.num_agents):
            actions.append(self.normal_noise.select_action(self.actors[i], state[i]))
        return np.array(actions)

    def select_action_evaluation(self, state):
        actions = []
        for i in range(self.num_agents):
            actions.append(self.actors[i](state[i]).cpu().detach().data.numpy().squeeze())
        return np.array(actions)

    def _critic_error(self, state, action, reward, next_state, done):
        states = torch.Tensor(state).view(-1, self.num_agents * self.state_size)            # batch x 2*24
        next_states = torch.Tensor(next_state).view(-1, self.num_agents * self.state_size)  # batch x 2*24
        actions = torch.Tensor(action).view(-1, self.num_agents * self.action_size)         # batch x 2*2
        rewards = torch.Tensor(reward).view(-1, self.num_agents * 1)
        dones = torch.Tensor(done.astype(int)).view(-1, self.num_agents * 1)

        with torch.no_grad():
            # 1.1. Calculate Target
            target_actions = []
            for i in range(self.num_agents):
                target_actions.append(self.target_actors[i](
                    next_states[:, self.state_size * i:self.state_size * (i + 1)]))
            target_actions = torch.stack(target_actions)      # shape: 2 (num_agents) x batch x 2 (num_actions)
            target_actions = target_actions.permute(1, 0, 2)  # 2 x batch_size x 2 --> batch_size x 2 x 2
            target_actions = target_actions.contiguous().view(-1, self.num_agents * self.action_size)  # batch_size x 2*2

            q_target_next = self.target_critic(next_states, target_actions)
            q_target = rewards + (self.gamma * q_target_next * (1 - dones))  # batch_size x 2: one q target per agent (we have rewards and dones for each agent)

            # 1.2. Expected
            q_expected = self.critic(states, actions)

            # 1.3. Compute loss
            td_error = q_expected - q_target.detach()

        return td_error.mean().detach().numpy()

    def step(self, state, action, reward, next_state, done, batch_size=64):
        self.t_step += 1  # increment number of visits

        # transform to np.array with proper shapes
        reward = np.asarray(reward)[:, np.newaxis]
        done = np.asarray(done)[:, np.newaxis]

        # add experiences to buffer (PER | replay) and learn when enough samples are available
        if self.buffer_type == 'prioritized':
            for i in range(self.num_agents):
                # the same joint transition is added once per agent
                error = self._critic_error(state, action, reward, next_state, done)
                self.per.add(error, (state, action, reward, next_state, done))

            # train if enough samples
            if self.t_step > batch_size:
                experiences, mini_batch, idxs, is_weights = self.per.sample(batch_size)
                c_loss, a_loss = self.learn(experiences, batch_size, idxs, is_weights)
            else:
                c_loss, a_loss = torch.Tensor([0]), (torch.Tensor([0]), torch.Tensor([0]))
        else:  # replay buffer
            self.memory.add(state, action, reward, next_state, done)

            # train if enough samples
            if len(self.memory) > batch_size:
                experiences = self.memory.sample(batch_size)
                c_loss, a_loss = self.learn(experiences, batch_size)
            else:
                c_loss, a_loss = torch.Tensor([0]), (torch.Tensor([0]), torch.Tensor([0]))

        return c_loss, a_loss

    def _update_critic_network(self, experiences, batch_size, idxs, is_weights):
        states, actions, rewards, next_states, dones = experiences
        # s, s' --> 64 x 2 x 24
        # a     --> 64 x 2 x 2
        # r, d  --> 64 x 2 x 1

        # transform to the shape the centralized critic expects --> batch_size x joint value
        states = states.view(-1, self.num_agents * self.state_size)            # batch x 2*24
        next_states = next_states.view(-1, self.num_agents * self.state_size)  # batch x 2*24
        actions = actions.view(-1, self.num_agents * self.action_size)         # batch x 2*2
        rewards = rewards.view(-1, self.num_agents * 1)
        dones = dones.view(-1, self.num_agents * 1)

        # 1.1. Calculate Target
        target_actions = []
        for i in range(self.num_agents):
            target_actions.append(self.target_actors[i](
                next_states[:, self.state_size * i:self.state_size * (i + 1)]))
        target_actions = torch.stack(target_actions)      # shape: 2 (num_agents) x batch x 2 (num_actions)
        # transform to proper shape
        target_actions = target_actions.permute(1, 0, 2)  # 2 x batch_size x 2 --> batch_size x 2 x 2
        target_actions = target_actions.contiguous().view(-1, self.num_agents * self.action_size)  # batch_size x 2*2

        q_target_next = self.target_critic(next_states, target_actions)
        q_target = rewards + (self.gamma * q_target_next * (1 - dones))  # batch_size x 2: one q target per agent (we have rewards and dones for each agent)

        # 1.2. Expected
        q_expected = self.critic(states, actions)

        # 1.3. Compute loss
        td_error = q_expected - q_target.detach()
        if self.buffer_type == 'prioritized':
            # PER --> update priorities
            with torch.no_grad():
                error = td_error.detach().numpy()
                for i in range(batch_size):
                    idx = idxs[i]
                    self.per.update(idx, error[i])
            value_loss = (torch.FloatTensor(is_weights) * td_error.pow(2).mul(0.5)).mean()
        else:
            value_loss = td_error.pow(2).mul(0.5).mean()
            # value_loss = F.mse_loss(q_expected, q_target)

        # 1.4. Update Critic
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        return value_loss

    def _update_actor_networks(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # transform to the shape the centralized critic expects --> batch_size x joint value
        states = states.view(-1, self.num_agents * self.state_size)            # batch x 2*24
        next_states = next_states.view(-1, self.num_agents * self.state_size)  # batch x 2*24
        actions = actions.view(-1, self.num_agents * self.action_size)         # batch x 2*2
        rewards = rewards.view(-1, self.num_agents * 1)
        dones = dones.view(-1, self.num_agents * 1)

        policy_losses = []
        for ID_actor in range(self.num_agents):
            # load network and optimizer
            optimizer = self.actor_optimizers[ID_actor]
            actor = self.actors[ID_actor]

            q_input_actions = []
            for i in range(self.num_agents):
                # this agent's actor acting on each agent's observation slice
                q_input_actions.append(actor(states[:, self.state_size * i:self.state_size * (i + 1)]))
            q_input_actions = torch.stack(q_input_actions)

            # transform to proper shape
            q_input_actions = q_input_actions.permute(1, 0, 2)  # 2 x batch_size x 2 --> batch_size x 2 x 2
            q_input_actions = q_input_actions.contiguous().view(-1, self.num_agents * self.action_size)  # batch_size x 2*2

            max_val = self.critic(states, q_input_actions)
            policy_loss = -max_val.mean()  # add minus because it's gradient ascent
            policy_losses.append(policy_loss)

            optimizer.zero_grad()
            policy_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.actors[ID_actor].parameters(), 1)
            optimizer.step()

            # save new network and optimizer state
            self.actor_optimizers[ID_actor] = optimizer
            self.actors[ID_actor] = actor

        return policy_losses[0], policy_losses[1]

    def learn(self, experiences, batch_size, idxs=0, is_weights=0):
        # *** 1. UPDATE Online Critic Network ***
        critic_loss = self._update_critic_network(experiences, batch_size, idxs, is_weights)

        # default actor loss for steps where the policy update is skipped
        actor_loss = (torch.Tensor([0]), torch.Tensor([0]))

        if self.t_step % self.policy_update == 0:
            # *** 2. UPDATE Online Actor Networks ***
            actor_loss = self._update_actor_networks(experiences)

            # *** 3. UPDATE TARGET/Offline networks ***
            for i in range(self.num_agents):
                self.soft_update(self.actors[i], self.target_actors[i], self.tau)
            self.soft_update(self.critic, self.target_critic, self.tau)

        return critic_loss, actor_loss

    def hard_update(self, local_model, target_model):
        """Hard update model parameters.
        Copy the values of the local network into the target.
        θ_target = θ_local

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
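# A shape-check sketch (not part of the original file) for the reshaping done in
# _update_critic_network above, using dummy tensors with the Tennis-style dimensions assumed
# in the comments: batch of 64, 2 agents, 24-dim observations, 2-dim actions. It only traces
# the view/permute steps; no learning happens here.
if __name__ == '__main__':
    batch_size, num_agents, state_size, action_size = 64, 2, 24, 2

    states = torch.rand(batch_size, num_agents, state_size)       # 64 x 2 x 24
    actions = torch.rand(batch_size, num_agents, action_size)     # 64 x 2 x 2

    flat_states = states.view(-1, num_agents * state_size)        # 64 x 48 --> centralized critic input
    flat_actions = actions.view(-1, num_agents * action_size)     # 64 x 4  --> centralized critic input

    # per-agent actor outputs, stacked then flattened exactly as the update methods do
    per_agent = [torch.rand(batch_size, action_size) for _ in range(num_agents)]
    stacked = torch.stack(per_agent)                               # 2 x 64 x 2
    joint_actions = stacked.permute(1, 0, 2).contiguous().view(-1, num_agents * action_size)  # 64 x 4

    print(flat_states.shape, flat_actions.shape, joint_actions.shape)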