    # per-step reward for the sub-policy currently selected by signal[i]
    r_p = p_z[signal[i]]
    ep_rewards[i].append(r_p)

# every T steps: update each agent's selected sub-policy and its critic
if steps % T == 0:
    for i in range(n_agent):
        # meta (controller) reward: utili[i] shrunk by the deviation |rat[i]| (plus a small constant)
        meta_rewards[i].append(utili[i] / (0.1 + abs(rat[i])))
        ep_actions[i] = np.array(ep_actions[i])
        ep_rewards[i] = np.array(ep_rewards[i], dtype=np.float_)
        ep_states[i] = np.array(ep_states[i])
        if LAMBDA < -0.1:
            # Monte-Carlo value targets
            targets = discount_rewards(ep_rewards[i], GAMMA)
            V[i][signal[i]].update(ep_states[i], targets)
            vs = V[i][signal[i]].get(ep_states[i])
        else:
            # TD(lambda) value targets, bootstrapped from the current observation
            vs = V[i][signal[i]].get(ep_states[i])
            targets = eligibility_traces(ep_rewards[i], vs,
                                         V[i][signal[i]].get(copy.deepcopy([obs[i]])),
                                         GAMMA, LAMBDA)
            V[i][signal[i]].update(ep_states[i], targets)
        # normalized advantages for the policy-gradient step
        ep_advantages = targets - vs
        ep_advantages = (ep_advantages - np.mean(ep_advantages)) / (np.std(ep_advantages) + 1e-10)
        Pi[i][signal[i]].update(ep_states[i], ep_actions[i], ep_advantages)
    ep_actions = [[] for _ in range(n_agent)]
    ep_rewards = [[] for _ in range(n_agent)]
    ep_states = [[] for _ in range(n_agent)]
if render:
    env.render()

# meta-level update for every agent that has collected meta rewards
for i in range(n_agent):
    if len(meta_rewards[i]) == 0:
        continue
    # centralized critic input for agent i: global meta state, agent one-hot, other agents' actions
    meta_state_s = np.concatenate(
        (meta_state, to_categorical([i] * T, n_agent), others_action), axis=1)
    amstate.append(meta_state_s)
    qsall = meta_Q.get(meta_state_s)            # Q(s, .) over all actions, shape (T, n_actions)
    allqsa.append(qsall)
    qsa = (qsall * ep_actions[i]).sum(axis=-1)  # Q-value of the action actually taken
    nothers_action = next_action[rmmyindex, :].reshape(-1)
    next_meta_state_s = np.concatenate(
        (mobs, to_categorical(i, n_agent), nothers_action))
    next_qsall = meta_Q.get([next_meta_state_s])
    next_qsa = (next_qsall * next_action[i]).sum(axis=-1)
    # TD(lambda) targets for the centralized critic, bootstrapped from the next joint state
    ltarget = eligibility_traces(ep_rewards[i], qsa, next_qsa, GAMMA, LAMBDA)
    targets.append(ltarget)

# stack the per-agent batches and take one update step on the centralized critic
targets = np.array(targets).transpose()
amstate = np.array(amstate)
s, a, t = [], [], []
for i in range(n_agent):
    s.append(amstate[i])
    a.append(ep_actions[i])
    t.append(targets[:, i])
s = np.array(s).reshape((T * n_agent, -1))
a = np.array(a).reshape((T * n_agent, -1))
t = np.array(t).reshape((T * n_agent, -1))
meta_Q.update(s, a, t[:, 0])

# compute counterfactual
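# The "compute counterfactual" marker above is not followed by its implementation in this
# excerpt. What follows is a minimal sketch of one standard, COMA-style way to form such a
# counterfactual advantage from the quantities gathered above (allqsa, ep_actions); it is
# not necessarily what the original code does. pi_probs[i] denotes hypothetical per-agent
# action probabilities of shape (T, n_actions), which are not part of this excerpt.
counterfactual_advantages = []
for i in range(n_agent):
    qsall = allqsa[i]                                   # Q(s, a') for every action a'
    qsa = (qsall * ep_actions[i]).sum(axis=-1)          # Q(s, a_i) of the taken action
    baseline = (qsall * pi_probs[i]).sum(axis=-1)       # counterfactual baseline E_{a'~pi_i}[Q(s, a')]
    counterfactual_advantages.append(qsa - baseline)    # A_i = Q(s, a_i) - baseline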
for i in range(n_agent):
    ep_rewards[i].append(rewards[i])

# every T steps: independent actor-critic update for each agent
if steps % T == 0:
    for i in range(n_agent):
        ep_actions[i] = np.array(ep_actions[i])
        ep_rewards[i] = np.array(ep_rewards[i], dtype=np.float_)
        ep_states[i] = np.array(ep_states[i])
        if LAMBDA < -0.1:
            # Monte-Carlo value targets
            targets = discount_rewards(ep_rewards[i], GAMMA)
            V[i].update(ep_states[i], targets)
            vs = V[i].get(ep_states[i])
        else:
            # TD(lambda) value targets, bootstrapped from the current observation
            vs = V[i].get(ep_states[i])
            targets = eligibility_traces(ep_rewards[i], vs,
                                         V[i].get(copy.deepcopy([obs[i]])),
                                         GAMMA, LAMBDA)
            V[i].update(ep_states[i], targets)
        # normalized advantages for the policy-gradient step
        ep_advantages = targets - vs
        ep_advantages = (ep_advantages - np.mean(ep_advantages)) / (np.std(ep_advantages) + 1e-10)
        Pi[i].update(ep_states[i], ep_actions[i], ep_advantages)
    ep_actions = [[] for _ in range(n_agent)]
    ep_rewards = [[] for _ in range(n_agent)]
    ep_states = [[] for _ in range(n_agent)]
if render:
    env.render()

print(i_episode)
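# discount_rewards and eligibility_traces are called throughout these excerpts but are not
# defined in them. Below is a minimal sketch of the conventional helpers they appear to
# correspond to (Monte-Carlo discounted returns and TD(lambda) / lambda-return targets);
# the repository's actual implementations may differ in detail.
import numpy as np

def discount_rewards(rewards, gamma):
    # G_t = r_t + gamma * G_{t+1}, computed backwards over the batch
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def eligibility_traces(rewards, values, bootstrap_value, gamma, lam):
    # lambda-return targets: accumulate one-step TD errors backwards with decay gamma*lam,
    # bootstrapping from the value of the state following the last step in the batch
    values = np.asarray(values, dtype=np.float64).reshape(-1)
    next_values = np.append(values[1:], np.asarray(bootstrap_value).reshape(-1)[0])
    deltas = np.asarray(rewards, dtype=np.float64) + gamma * next_values - values
    targets = np.zeros_like(deltas)
    acc = 0.0
    for t in reversed(range(len(rewards))):
        acc = deltas[t] + gamma * lam * acc
        targets[t] = acc + values[t]
    return targets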
            # greedy[i] selects whether the V[i] critic or its gV[i] counterpart is updated
            vs = V[i].get(ep_states[i])
        else:
            gV[i].update(ep_states[i], targets)
            vs = gV[i].get(ep_states[i])
    else:
        next_s = copy.deepcopy(obs[i])
        if not greedy[i]:
            vs = V[i].get(ep_states[i])
            # extend next_s with the greedy policy's action distribution and the extra
            # features returned by get_more_obs_com before bootstrapping from V[i]
            more_obs = gPi[i].get_dist(np.array([obs[i]]))[0]
            next_s.extend(more_obs)
            more_obs = get_more_obs_com(True, neighbors, average_jpi, i, more_obs_size)
            next_s.extend(more_obs)
            targets = eligibility_traces(ep_rewards[i], vs, V[i].get([next_s]), GAMMA, LAMBDA)
            V[i].update(ep_states[i], targets)
        else:
            vs = gV[i].get(ep_states[i])
            targets = eligibility_traces(ep_rewards[i], vs, gV[i].get([next_s]), GAMMA, LAMBDA)
            gV[i].update(ep_states[i], targets)
    # normalized advantages, collected per agent
    ep_advantages = targets - vs
    ep_advantages = (ep_advantages - np.mean(ep_advantages)) / (np.std(ep_advantages) + 1e-10)
    all_ep_advantages.append(ep_advantages)
all_ep_advantages = np.array(all_ep_advantages)
# variant of the centralized-critic update above, with targets computed from the meta-level rewards
for i in range(n_agent):
    meta_state_s = np.concatenate(
        (meta_state, to_categorical([i] * T, n_agent), others_action), axis=1)
    amstate.append(meta_state_s)
    qsall = meta_Q.get(meta_state_s)
    allqsa.append(qsall)
    qsa = (qsall * ep_actions[i]).sum(axis=-1)
    nothers_action = next_action[rmmyindex, :].reshape(-1)
    next_meta_state_s = np.concatenate(
        (mobs, to_categorical(i, n_agent), nothers_action))
    next_qsall = meta_Q.get([next_meta_state_s])
    next_qsa = (next_qsall * next_action[i]).sum(axis=-1)
    # here the TD(lambda) targets use meta_rewards rather than the per-agent ep_rewards
    ltarget = eligibility_traces(meta_rewards, qsa, next_qsa, GAMMA, LAMBDA)
    targets.append(ltarget)

# stack the per-agent batches and update the centralized critic
targets = np.array(targets).transpose()
amstate = np.array(amstate)
s, a, t = [], [], []
for i in range(n_agent):
    s.append(amstate[i])
    a.append(ep_actions[i])
    t.append(targets[:, i])
s = np.array(s).reshape((T * n_agent, -1))
a = np.array(a).reshape((T * n_agent, -1))
t = np.array(t).reshape((T * n_agent, -1))
meta_Q.update(s, a, t[:, 0])

# compute counterfactual
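# All of the excerpts above drive the networks only through a small get/update interface
# (V[i].get/update, Pi[i].update, gPi[i].get_dist, meta_Q.get/update). The classes below are
# a sketch of that assumed interface; the names and docstrings are illustrative placeholders,
# not the repository's actual classes, and the learning steps are deliberately left abstract.
class ValueNetwork:
    def get(self, states):
        """Return predicted values V(s) for a batch of states, shape (len(states),)."""
        raise NotImplementedError

    def update(self, states, targets):
        """Fit V(s) to the given value targets, e.g. one gradient step on an MSE loss."""
        raise NotImplementedError


class PolicyNetwork:
    def get_dist(self, states):
        """Return action probabilities pi(.|s), shape (len(states), n_actions)."""
        raise NotImplementedError

    def update(self, states, one_hot_actions, advantages):
        """Policy-gradient step: increase log pi(a|s) in proportion to the advantage."""
        raise NotImplementedError


class QNetwork:
    def get(self, states):
        """Return Q(s, .) for every action, shape (len(states), n_actions)."""
        raise NotImplementedError

    def update(self, states, one_hot_actions, targets):
        """Regress the Q-values of the taken actions toward the given targets."""
        raise NotImplementedError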