Example #1
import numpy as np
from tqdm import tqdm

# Config, make_parallel_env, update_config, Agent, QLearner, reset_wrapper and
# step_wrapper are project-local helpers defined elsewhere in the codebase.

def test_training():
    config = Config()
    n = 1
    env = make_parallel_env(n, 100000)
    update_config(env, config)

    model_path = "/home/liub/Desktop/mount/teamstrategy/oldmodels/mpe/aqmix+coach+vi2+ctr8+l10.0001+l20.0001/run0"
    #model_path = "/home/liub/Desktop/mount/teamstrategy/models/mpe/aqmix+ctr8+l10.0001+l20.0001/run0"

    # setup modules
    mac = Agent(config) # policy
    qlearner = QLearner(mac, config)
    qlearner.load_models(model_path)
    qlearner.cuda()

    all_rewards = []

    # Count coach->agent broadcasts actually sent vs. broadcast opportunities;
    # the tiny initial value of the denominator avoids division by zero.
    orders = 0
    tt_orders = 1e-12
    for it in tqdm(range(100)):  # 100 evaluation episodes
        o, e, c, m, ms = reset_wrapper(env)
        rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])

        episode_reward = 0
        prev_z = None
        for t in range(config.max_steps):
            o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)
            if config.has_coach and t % config.centralized_every == 0:
                # The coach proposes a per-agent team strategy (and its
                # log-variance) from the centralized observation.
                _, z_team, logvar = qlearner.coach(o_, e_, c_, ms_)
                if prev_z is None:
                    # First coach step: broadcast the strategy to every agent.
                    mac.set_team_strategy(z_team)
                    prev_z = z_team
                else:
                    # Mask of alive agents (rows whose state mask is all zero are dead).
                    mask = ms_.sum(-1).gt(0).float()
                    # Re-broadcast only to agents whose strategy has drifted more than
                    # an L2 distance of 5 since the last broadcast.  (An alternative
                    # rule, commented out in the original, thresholds the likelihood
                    # of prev_z under Normal(z_team, exp(0.5 * logvar)).)
                    l2 = (z_team - prev_z).pow(2).sum(-1).sqrt()
                    broadcast = (l2 > 5).float()
                    mac.set_part_team_strategy(z_team, broadcast)
                    orders += (broadcast * mask).sum().item()
                    tt_orders += mask.sum().item()
                    prev_z = mac.z_team.clone()

            actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden, epsilon=0.)
            o, e, m, ms, r, d = step_wrapper(env, actions)
            episode_reward += r.sum()

        all_rewards.append(episode_reward)
    all_rewards = np.array(all_rewards)
    print(f"broadcast rate {orders/tt_orders}")
    print(f"mean reward {all_rewards.mean()} | std reward {all_rewards.std()}")
    return all_rewards.mean()
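
The heart of test_training is the intermittent broadcast rule: at each coach step an agent is sent the new team-strategy vector only if it has drifted far enough from the last one it received. The helper below is a minimal, self-contained restatement of that rule; the function name, tensor shapes and example values are illustrative assumptions, not identifiers from the project.

import torch

def select_broadcasts(z_team, prev_z, threshold=5.0):
    """Return a {0,1} mask of agents whose strategy should be re-broadcast.

    z_team, prev_z: (batch, n_agents, d) team-strategy vectors.
    """
    # Per-agent L2 distance between the new strategy and the last one sent.
    drift = (z_team - prev_z).pow(2).sum(-1).sqrt()
    # Re-broadcast only where the strategy moved more than `threshold`
    # (test_training uses threshold = 5 and additionally masks out dead
    # agents when counting sent orders).
    return (drift > threshold).float()

# Tiny example: 1 batch, 3 agents, 4-dim strategies; only agent 0 has drifted.
z_new = torch.zeros(1, 3, 4)
z_new[0, 0] = 10.0
z_old = torch.zeros(1, 3, 4)
print(select_broadcasts(z_new, z_old))  # tensor([[1., 0., 0.]])
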
Example #2
import numpy as np
import torch
from tqdm import tqdm

# make_parallel_env, update_config, Agent, QLearner, reset_wrapper, step_wrapper
# and the evaluation-episode count n_eval are project-local names defined
# elsewhere in the script.

def test_exp(config, fn, exp, threshold=0.):
    env = make_parallel_env(1, 9999, fn)
    update_config(env, config)
    config.method = exp

    # The experiment name is expected to contain "ctrN", where the single digit N
    # is the coach broadcast interval (in environment steps).
    k = exp.find("ctr")
    config.centralized_every = int(exp[k + 3:k + 4])
    config.agent_hidden_dim = 165 if "165" in exp else 128

    if "coach" in exp:
        config.has_coach = True

    # setup modules
    mac = Agent(config)  # policy
    qlearner = QLearner(mac, config)

    R = []
    OR = []

    # Evaluate the checkpoints of five independent training runs.
    for run_num in tqdm([0, 1, 2, 3, 4]):
        model_path = f"/home/liub/Desktop/mount/teamstrategy/coach1/mpe/{exp}/run{run_num}"

        qlearner.load_models(model_path)
        qlearner.cuda()

        reward = 0
        n_orders = 0
        n_total_orders = 1e-12  # tiny initial value avoids division by zero

        for n_ep in range(n_eval):  # n_eval: evaluation episodes per run, defined outside this function
            o, e, c, m, ms = reset_wrapper(env)
            prev_a = torch.zeros(o.shape[0],
                                 o.shape[1]).long().to(config.device)
            rnn_hidden = mac.init_hidden(o.shape[0], o.shape[1])

            prev_z = None

            for t in range(145):  # fixed evaluation episode length
                # "full" variants always substitute the full mask ms for m;
                # "interval" variants do so only every centralized_every steps.
                if "full" in exp:
                    m = ms
                if "interval" in exp and t % config.centralized_every == 0:
                    m = ms
                o_, e_, c_, m_, ms_ = mac.tensorize(o, e, c, m, ms)

                if config.has_coach and t % config.centralized_every == 0:
                    # Mask of alive agents (rows whose state mask is all zero are dead).
                    ma = ms_.sum(-1).gt(0).float()
                    with torch.no_grad():
                        _, z_team, _ = qlearner.coach(o_, e_, c_, ms_)
                    if prev_z is None:
                        # First coach step: broadcast the strategy to every alive agent.
                        mac.set_team_strategy(z_team * ma.unsqueeze(-1))
                        prev_z = z_team
                        n_orders += ma.sum().item()
                        n_total_orders += ma.sum().item()
                    else:
                        # Re-broadcast only to agents whose strategy has drifted more
                        # than `threshold` in L2 distance since the last broadcast.
                        l2 = (z_team * ma.unsqueeze(-1) -
                              prev_z * ma.unsqueeze(-1)).pow(2).sum(-1).sqrt()
                        broadcast = (l2 > threshold).float()
                        mac.set_part_team_strategy(z_team, broadcast)
                        n_orders += broadcast.sum().item()
                        n_total_orders += ma.sum().item()
                        prev_z = mac.z_team.clone()

                actions, rnn_hidden = mac.step(o_, e_, c_, m_, ms_, rnn_hidden,
                                               prev_a, 0)  # 0: exploration epsilon (greedy), as in test_training
                prev_a = torch.LongTensor(actions).to(config.device)
                o, e, m, ms, r, d = step_wrapper(env, actions)
                reward += r.sum()

        reward = reward / n_eval          # mean episode reward over this run
        rate = n_orders / n_total_orders  # fraction of possible broadcasts actually sent

        R.append(reward)
        OR.append(rate)

    R = np.array(R)
    OR = np.array(OR)
    print(
        f"{exp:30s}[{threshold:5.1f}] | muR: {R.mean():.4f} stdR: {R.std()/np.sqrt(5):.4f} "
        f"| muC: {OR.mean():.4f} stdC: {OR.std()/np.sqrt(5):.4f}"
    )
    return R.mean(), R.std(), OR.mean(), OR.std()
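
For reference, the experiment-name parsing at the top of test_exp can be exercised on its own. The function below is an illustrative restatement (the name parse_exp_name is mine, not from the project); it reproduces the values that test_exp writes into config.

def parse_exp_name(exp):
    """Recover the settings that test_exp derives from an experiment name."""
    k = exp.find("ctr")                        # name must contain "ctrN"
    centralized_every = int(exp[k + 3:k + 4])  # single-digit broadcast interval
    agent_hidden_dim = 165 if "165" in exp else 128
    has_coach = "coach" in exp
    return centralized_every, agent_hidden_dim, has_coach

print(parse_exp_name("aqmix+coach+vi2+ctr8+l10.0001+l20.0001"))
# -> (8, 128, True)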