Example #1
def test_func(
    rank,
    E,
    T,
    args,
    test_q,
    device,
    tensorboard_dir,
):
    torch.manual_seed(args.seed + rank)
    np.random.seed(args.seed + rank)
    print("set up Test process env")
    opp = args.opp_list[rank]
    # non_station evaluation
    # if args.exp_name == "test":
    #     env = gym.make("CartPole-v0")
    # elif p2 == "Non-station":
    #     env = make_ftg_ram_nonstation(args.env, p2_list=args.list, total_episode=args.test_episode,
    #                                   stable=args.stable)
    # else:
    #     env = make_ftg_ram(args.env, p2=p2)
    # obs_dim = env.observation_space.shape[0]
    # act_dim = env.action_space.n
    env = SoccerPLUS()
    obs_dim = env.n_features
    act_dim = env.n_actions

    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    local_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
    env.close()
    del env
    temp_dir = os.path.join(tensorboard_dir, "test_{}".format(opp))
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    writer = SummaryWriter(log_dir=temp_dir)
    # Main loop: collect experience in env and update/log each epoch
    while True:
        received_obj = test_q.get()
        (test_model, t) = received_obj
        print("TEST Process {} loaded new mode at {} step".format(rank, t))
        model_dict = deepcopy(test_model)
        local_ac.load_state_dict(model_dict)
        del received_obj
        # if args.exp_name == "test":
        #     env = gym.make("CartPole-v0")
        # elif p2 == "Non-station":
        #     env = make_ftg_ram_nonstation(args.env, p2_list=args.list, total_episode=args.test_episode,stable=args.stable)
        # else:
        #     env = make_ftg_ram(args.env, p2=p2)
        env = SoccerPLUS()
        print("TESTING process {} start to test, opp: {}".format(rank, opp))
        m_score, win_rate, steps = test_proc(local_ac, env, opp, args, device)
        test_summary(opp, steps, m_score, win_rate, writer, args, t)
        print("TESTING process {} finished, opp: {}".format(rank, opp))
        env.close()
        del env
        if t >= args.episode:
            break
    print("Process {}\tTester Ended".format(rank))
Example #2
def main():
    # env = gym.make('CartPole-v0')
    # obs_dim = env.observation_space.shape[0]
    # act_dim = env.action_space.n

    env = SoccerPLUS(visual=False)
    obs_dim = env.n_features
    act_dim = env.n_actions
    learning_rate = 0.0001
    gamma = 0.98
    hidden = 256
    n_rollout = 10
    policy_type = 1
    opp_policy = Policy(game=env, player_num=False)
    model = ActorCritic(obs_dim, act_dim, hidden, learning_rate, gamma)

    # Training Loop
    print_interval = 100
    score = 0.0
    n_epi = 0
    while True:  # trains until interrupted; env.close() below is only reached if this loop is broken
        n_epi += 1
        done = False
        s = env.reset()
        while not done:
            for t in range(n_rollout):
                prob = model.pi(torch.from_numpy(s).float())
                m = Categorical(prob)
                a = m.sample().item()
                # s_prime, r, done, info = env.step(a)
                s_prime, r, done, info = env.step(
                    a, opp_policy.get_actions(policy_type))
                env.render()
                model.put_data((s, a, r, s_prime, done))

                s = s_prime
                score += r

                if done:
                    break

            model.train_net()

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score : {:.1f}".format(
                n_epi, score / print_interval))
            score = 0.0
    env.close()
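The ActorCritic model used above follows the common n-step advantage actor-critic recipe: put_data() buffers transitions from the current rollout and train_net() runs one update on them. The sketch below shows what such a class typically looks like (minimalRL-style), matching the constructor signature ActorCritic(obs_dim, act_dim, hidden, learning_rate, gamma) used in main(); it is an illustration of the pattern, not the project's actual implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class ActorCritic(nn.Module):
    """Illustrative n-step advantage actor-critic (sketch, not the project's code)."""

    def __init__(self, obs_dim, act_dim, hidden, learning_rate, gamma):
        super().__init__()
        self.data = []
        self.gamma = gamma
        self.fc1 = nn.Linear(obs_dim, hidden)
        self.fc_pi = nn.Linear(hidden, act_dim)
        self.fc_v = nn.Linear(hidden, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim=0):
        # Action distribution over act_dim discrete actions.
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc_pi(x), dim=softmax_dim)

    def v(self, x):
        # State-value estimate.
        return self.fc_v(F.relu(self.fc1(x)))

    def put_data(self, transition):
        self.data.append(transition)

    def make_batch(self):
        s_lst, a_lst, r_lst, s_prime_lst, done_lst = [], [], [], [], []
        for s, a, r, s_prime, done in self.data:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_lst.append([0.0 if done else 1.0])
        self.data = []
        return (torch.tensor(s_lst, dtype=torch.float),
                torch.tensor(a_lst),
                torch.tensor(r_lst, dtype=torch.float),
                torch.tensor(s_prime_lst, dtype=torch.float),
                torch.tensor(done_lst, dtype=torch.float))

    def train_net(self):
        s, a, r, s_prime, done_mask = self.make_batch()
        # One-step TD target and advantage estimate.
        td_target = r + self.gamma * self.v(s_prime) * done_mask
        delta = td_target - self.v(s)
        pi = self.pi(s, softmax_dim=1)
        pi_a = pi.gather(1, a)
        # Policy-gradient term plus value regression loss.
        loss = -torch.log(pi_a) * delta.detach() + \
               F.smooth_l1_loss(self.v(s), td_target.detach())
        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()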
Example #3
                         z_dim=args.z_dim,
                         c_dim=args.c_dim,
                         device=device)
    else:
        global_cpc = None
    # create shared model for actor
    global_ac_targ = deepcopy(global_ac)
    shared_ac = deepcopy(global_ac).cpu()
    # create optimizer
    pi_optimizer = Adam(global_ac.pi.parameters(), lr=args.lr, eps=1e-4)
    q1_optimizer = Adam(global_ac.q1.parameters(), lr=args.lr, eps=1e-4)
    q2_optimizer = Adam(global_ac.q2.parameters(), lr=args.lr, eps=1e-4)
    alpha_optim = Adam([global_ac.log_alpha], lr=args.lr, eps=1e-4)
    if args.cpc:
        cpc_optimizer = Adam(global_cpc.parameters(), lr=args.lr, eps=1e-4)
    env.close()
    del env

    # training setup
    T = Counter()  # training steps
    E = Counter()  # training episode
    replay_buffer = ReplayBufferOppo(obs_dim=obs_dim,
                                     max_size=args.replay_size,
                                     cpc=args.cpc,
                                     cpc_model=global_cpc,
                                     writer=writer,
                                     E=E)

    # bufferopp1 = ReplayBufferOppo(obs_dim=obs_dim, max_size=args.replay_size, cpc=args.cpc,
    #                                  cpc_model=global_cpc, writer=writer, E=E)
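This fragment sets up a SAC-style learner: a target copy of the actor-critic (global_ac_targ), separate Adam optimizers for the policy, the two Q-heads, and the entropy temperature log_alpha, plus an opponent-aware replay buffer. After each gradient step, a setup like this typically refreshes the target network with a Polyak (soft) update. A minimal sketch of that step, assuming the coefficient comes from a field such as args.polyak (an assumed name) and the parameter layout of global_ac/global_ac_targ above:

import torch


def soft_update(global_ac, global_ac_targ, polyak=0.995):
    # theta_targ <- polyak * theta_targ + (1 - polyak) * theta
    with torch.no_grad():
        for p, p_targ in zip(global_ac.parameters(), global_ac_targ.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)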