def test_training():
    """Evaluate one saved checkpoint and report reward and broadcast rate.

    Builds a single parallel environment, loads the learner weights from a
    hard-coded checkpoint directory, moves the learner to CUDA and rolls out
    100 episodes of ``config.max_steps`` steps with ``epsilon=0``.

    When ``config.has_coach`` is set, every ``config.centralized_every`` steps
    the coach produces a new team strategy ``z_team``. On the first such step
    of an episode it is installed for everyone via ``mac.set_team_strategy``;
    afterwards only agents whose new strategy is more than 5 away (L2 norm
    over the last axis) from the previously held one are flagged in
    ``broadcast`` and passed to ``mac.set_part_team_strategy``.

    Prints the fraction of flagged updates among masked-in agents and the
    mean / std of the per-episode rewards.

    Returns:
        Mean of the per-episode summed rewards.
    """
    config = Config()
    n_envs = 1
    env = make_parallel_env(n_envs, 100000)
    update_config(env, config)

    model_path = "/home/liub/Desktop/mount/teamstrategy/oldmodels/mpe/aqmix+coach+vi2+ctr8+l10.0001+l20.0001/run0"
    # Alternative checkpoint kept for reference:
    #model_path = "/home/liub/Desktop/mount/teamstrategy/models/mpe/aqmix+ctr8+l10.0001+l20.0001/run0"

    # setup modules
    mac = Agent(config)  # policy
    qlearner = QLearner(mac, config)
    qlearner.load_models(model_path)
    qlearner.cuda()

    n_episodes = 100
    broadcast_threshold = 5  # L2 distance above which a new strategy is sent

    all_rewards = []
    # Plain Python floats so the printed rate is a number, not a tensor.
    orders = 0.0
    tt_orders = 1e-12  # tiny offset avoids 0/0 when the coach never re-plans

    for _ in tqdm(range(n_episodes)):
        o, e, c, m, ms = reset_wrapper(env)
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])
        episode_reward = 0
        prev_z = None  # strategy currently held by the agents; None until first plan

        for t in range(config.max_steps):
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)

            if config.has_coach and t % config.centralized_every == 0:
                # Evaluation only: do not record autograd history for the
                # coach output, which is kept around as prev_z.
                with torch.no_grad():
                    _, z_team, _logvar = qlearner.coach(o_, e_, c_, ms_)

                if prev_z is None:
                    mac.set_team_strategy(z_team)
                    prev_z = z_team
                else:
                    # 1.0 where the last axis of ms_ has a positive sum
                    # (presumably marks active agents -- TODO confirm).
                    mask = ms_.sum(-1).gt(0).float()
                    # A likelihood-based criterion (probability of prev_z
                    # under Normal(z_team, exp(0.5 * logvar))) was tried here
                    # before; the L2 distance below is what is in use.
                    l2 = (z_team - prev_z).pow(2).sum(-1).sqrt()
                    broadcast = (l2 > broadcast_threshold).float()
                    mac.set_part_team_strategy(z_team, broadcast)
                    orders += (broadcast * mask).sum().item()
                    tt_orders += mask.sum().item()
                    # Compare the next plan against what the agents hold now.
                    prev_z = mac.z_team.clone()

            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, epsilon=0.)
            # step_wrapper does not return a new `c`; the one from reset is reused.
            o, e, m, ms, r, _done = step_wrapper(env, actions)
            episode_reward += r.sum()

        all_rewards.append(episode_reward)

    all_rewards = np.array(all_rewards)
    print(f"broadcast rate {orders/tt_orders}")
    print(f"mean reward {all_rewards.mean()} | std reward {all_rewards.std()}")
    return all_rewards.mean()
def test_exp(config, fn, exp, threshold=0.):
    """Evaluate the saved runs of one experiment and summarise the results.

    The experiment name ``exp`` drives the configuration: the integer after
    ``"ctr"`` becomes ``config.centralized_every``, ``"165"`` selects a hidden
    size of 165 (otherwise 128), and ``"coach"`` switches the coach on.
    For each of the runs 0-4 the checkpoint under
    ``.../coach1/mpe/{exp}/run{k}`` is loaded and ``n_eval`` episodes of 145
    steps are rolled out.

    With a coach, every ``config.centralized_every`` steps a new team strategy
    is computed. The first one of an episode is installed for all agents;
    later ones are flagged per agent in ``broadcast`` when the L2 distance to
    the previously held strategy exceeds ``threshold``, and passed to
    ``mac.set_part_team_strategy``.

    Args:
        config: Configuration object; mutated in place (``method``,
            ``centralized_every``, ``agent_hidden_dim``, ``has_coach``).
        fn: Forwarded as third argument to ``make_parallel_env``.
        exp: Experiment name; must contain a ``ctr<N>`` token.
        threshold: L2 distance above which a new strategy is broadcast.

    Returns:
        Tuple ``(R.mean(), R.std(), OR.mean(), OR.std())`` where ``R`` holds
        the per-run average episode reward and ``OR`` the per-run broadcast
        rate. Note the printed summary shows ``std / sqrt(n_runs)`` while the
        returned values are the plain standard deviations.

    Raises:
        ValueError: If ``exp`` contains no ``ctr<N>`` token.
    """
    import re  # local import: used only to parse the experiment name

    env = make_parallel_env(1, 9999, fn)
    update_config(env, config)
    config.method = exp

    # Parse every digit after "ctr" (the previous single-character slice
    # truncated multi-digit values and mis-sliced when "ctr" was absent).
    ctr_match = re.search(r"ctr(\d+)", exp)
    if ctr_match is None:
        raise ValueError(f"experiment name {exp!r} has no 'ctr<N>' token")
    config.centralized_every = int(ctr_match.group(1))

    config.agent_hidden_dim = 165 if "165" in exp else 128
    # NOTE(review): has_coach is only ever switched on here. If one config
    # object is reused across experiments it stays True for later names
    # without "coach" -- confirm whether that is intended.
    if "coach" in exp:
        config.has_coach = True

    # setup modules
    mac = Agent(config)  # policy
    qlearner = QLearner(mac, config)

    run_ids = [0, 1, 2, 3, 4]
    n_steps = 145  # fixed episode horizon used for this evaluation

    R = []
    OR = []
    for run_num in tqdm(run_ids):
        model_path = f"/home/liub/Desktop/mount/teamstrategy/coach1/mpe/{exp}/run{run_num}"
        qlearner.load_models(model_path)
        qlearner.cuda()

        reward = 0
        n_orders = 0
        n_total_orders = 1e-12  # tiny offset avoids 0/0 when there is no coach

        # NOTE(review): n_eval is not defined in this function; presumably a
        # module-level constant -- verify it exists where this is called.
        for _ in range(n_eval):
            o, e, c, m, ms = reset_wrapper(env)
            prev_a = torch.zeros(o.shape[0], o.shape[1]).long().to(config.device)
            rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])
            prev_z = None  # strategy currently held by the agents

            for t in range(n_steps):
                # "full": always replace m by ms; "interval": only on the
                # steps where the coach would also act.
                if "full" in exp:
                    m = ms
                if "interval" in exp and t % config.centralized_every == 0:
                    m = ms

                o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)

                if config.has_coach and t % config.centralized_every == 0:
                    # 1.0 where the last axis of ms_ has a positive sum
                    # (presumably marks active agents -- TODO confirm).
                    ma = ms_.sum(-1).gt(0).float()
                    with torch.no_grad():
                        _, z_team, _ = qlearner.coach(o_, e_, c_, ms_)

                    if prev_z is None:
                        # First plan of the episode goes to every masked-in agent.
                        mac.set_team_strategy(z_team * ma.unsqueeze(-1))
                        prev_z = z_team
                        n_orders += ma.sum().item()
                        n_total_orders += ma.sum().item()
                    else:
                        ma_col = ma.unsqueeze(-1)
                        # Distance is measured on masked strategies, so
                        # masked-out agents always have l2 == 0.
                        l2 = (z_team * ma_col - prev_z * ma_col).pow(2).sum(-1).sqrt()
                        broadcast = (l2 > threshold).float()
                        mac.set_part_team_strategy(z_team, broadcast)
                        # Count only masked-in agents; this differs from the
                        # unmasked sum only for a negative threshold.
                        n_orders += (broadcast * ma).sum().item()
                        n_total_orders += ma.sum().item()
                        prev_z = mac.z_team.clone()

                actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, prev_a, 0)
                prev_a = torch.LongTensor(actions).to(config.device)
                # step_wrapper does not return a new `c`; the one from reset is reused.
                o, e, m, ms, r, _done = step_wrapper(env, actions)
                reward += r.sum()

        R.append(reward / n_eval)
        OR.append(n_orders / n_total_orders)

    R = np.array(R)
    OR = np.array(OR)
    sqrt_runs = np.sqrt(len(run_ids))
    # ":3g" instead of ":3d": the default threshold is a float and the "d"
    # presentation type raises ValueError for floats; ints render the same.
    print(
        f"{exp:30s}[{threshold:3g}] | muR: {R.mean():.4f} stdR: {R.std()/sqrt_runs:.4f} | muC: {OR.mean():.4f} stdC: {OR.std()/sqrt_runs:.4f}"
    )
    return R.mean(), R.std(), OR.mean(), OR.std()