# One environment step: store the transition and advance the observation.
ep_len += 1
buff.add(obs, p, reward, next_obs, terminated)
obs = next_obs

# Reset the episode on termination or when it reaches the step limit.
if terminated or ep_len == max_ep_len:
    obs = env.reset()
    terminated = False
    ep_len = 0

# Periodically evaluate the current policy.
if steps % 10000 == 0:
    print(test_agent())

# Skip training during the warm-up phase and between update intervals.
if steps < 1000 or steps % 50 != 0:
    continue

for _ in range(50):
    # Unpack a sampled batch into state, per-agent action, and next-state arrays.
    batch = buff.getBatch(batch_size)
    for j in range(batch_size):
        X[j] = batch[j][0]
        next_X[j] = batch[j][3]
        for i in range(n_ant):
            A[i][j] = batch[j][1][i]

    # One-step TD target from the target network: r + gamma * Q_tot'(s') * (1 - done).
    Q_target = agents.Q_tot_tar.predict(next_X, batch_size=batch_size)
    for j in range(batch_size):
        Q_target[j] = batch[j][2] + Q_target[j] * gamma * (1 - batch[j][4])

    agents.train_critic(X, A, Q_target)
    agents.train_actors(X)
    agents.update()
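# The loop above assumes a replay buffer whose getBatch returns a list of
# (obs, joint_action, reward, next_obs, done) tuples, matching the batch[j][0..4]
# indexing. A minimal sketch of such a buffer (the class name and internals are
# assumptions, not the original implementation):

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, capacity):
        # Bounded FIFO storage: the oldest transitions are dropped once full.
        self.buffer = deque(maxlen=capacity)

    def add(self, obs, actions, reward, next_obs, done):
        self.buffer.append((obs, actions, reward, next_obs, done))

    def getBatch(self, batch_size):
        # Uniform random sample of stored transitions.
        return random.sample(self.buffer, batch_size)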
# One environment step: accumulate the episode return and store the transition.
sum_reward += reward
steps += 1
buff.add(obs, p, reward, next_obs, terminated)
obs = next_obs

# On termination, reset the environment and log the finished episode's return.
if terminated:
    obs = env.reset()
    terminated = False
    reward_list.append(sum_reward)
    sum_reward = 0

# Once the buffer is full, run a round of updates, then clear it (on-policy data).
if buff.pointer > buffer_size:
    print(np.mean(reward_list))
    reward_list = []

    # Critic updates: regress Q onto the bootstrapped target r + gamma * Q'(s') * (1 - done).
    for _ in range(num_ite):
        states, actions, returns, next_states, dones, gammas = buff.getBatch(mini_batch)
        Q_target = agents.compute_target([next_states])[0]
        Q_target = returns + Q_target * gammas * (1 - dones)
        agents.train_critic(states, actions, Q_target)
        agents.update()

    # Actor update on a large batch, with optional per-agent advantage normalization.
    states, actions, returns, next_states, dones, gammas = buff.getBatch(2000)
    advantages = agents.compute_advantage([states] + [actions[i] for i in range(n_ant)])
    if advantage_norm:
        for i in range(n_ant):
            advantages[i] = (advantages[i] - advantages[i].mean()) / (advantages[i].std() + 1e-8)
    agents.train_actors(states, actions, advantages)
    buff.reset()
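# The on-policy loop assumes a buffer exposing a write pointer, a getBatch that
# returns stacked (states, actions, returns, next_states, dones, gammas) arrays,
# and a reset() called after each update round. A minimal sketch under those
# assumptions (one-step returns with a constant discount; names and internals
# are illustrative, not the original code):

import numpy as np

class RolloutBuffer:
    def __init__(self, n_ant, gamma):
        self.n_ant = n_ant
        self.gamma = gamma
        self.storage = []   # (obs, per-agent actions, reward, next_obs, done)
        self.pointer = 0    # number of stored transitions

    def add(self, obs, actions, reward, next_obs, done):
        self.storage.append((obs, actions, reward, next_obs, done))
        self.pointer += 1

    def getBatch(self, batch_size):
        idx = np.random.randint(0, self.pointer, size=batch_size)
        rows = [self.storage[i] for i in idx]
        states = np.array([r[0] for r in rows])
        # Per-agent action arrays, matching the actions[i] indexing in the loop.
        actions = [np.array([r[1][a] for r in rows]) for a in range(self.n_ant)]
        # One-step targets: the return is the immediate reward, discounted by gamma.
        returns = np.array([r[2] for r in rows], dtype=np.float32)
        next_states = np.array([r[3] for r in rows])
        dones = np.array([r[4] for r in rows], dtype=np.float32)
        gammas = np.full(batch_size, self.gamma, dtype=np.float32)
        return states, actions, returns, next_states, dones, gammas

    def reset(self):
        self.storage = []
        self.pointer = 0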