Example 1
def test_func(
    test_q,
    rank,
    E,
    p2,
    args,
    device,
    tensorboard_dir,
):
    torch.manual_seed(args.seed + rank)
    np.random.seed(args.seed + rank)
    print("set up Test process env")
    temp_dir = os.path.join(tensorboard_dir, "test_{}".format(p2))
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    writer = SummaryWriter(log_dir=temp_dir)
    # non-stationary evaluation
    if args.exp_name == "test":
        env = gym.make("CartPole-v0")
    elif p2 == "Non-station":
        env = make_ftg_ram_nonstation(args.env,
                                      p2_list=args.list,
                                      total_episode=args.test_episode,
                                      stable=args.stable)
    else:
        env = make_ftg_ram(args.env, p2=p2)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    if args.cpc:
        local_ac = MLPActorCritic(obs_dim + args.c_dim, act_dim, **ac_kwargs)
    else:
        local_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
    env.close()
    del env
    # Main loop: collect experience in env and update/log each epoch
    while E.value() <= args.episode:
        received_obj = test_q.get()
        e = E.value()
        print("TEST Process {} loaded new mode".format(rank))
        model_dict = deepcopy(received_obj)
        local_ac.load_state_dict(model_dict)
        del received_obj
        if args.exp_name == "test":
            env = gym.make("CartPole-v0")
        elif p2 == "Non-station":
            env = make_ftg_ram_nonstation(args.env,
                                          p2_list=args.list,
                                          total_episode=args.test_episode,
                                          stable=args.stable)
        else:
            env = make_ftg_ram(args.env, p2=p2)
        print("TESTING process {} start to test, opp: {}".format(rank, p2))
        m_score, win_rate, steps = test_proc(local_ac, env, args, device)
        test_summary(p2, steps, m_score, win_rate, writer, args, e)
        env.close()
        del env
        print("TESTING process {} finished, opp: {}".format(rank, p2))
Example 2
def test_func(
    global_ac,
    rank,
    e,
    p2,
    args,
    device,
    tensorboard_dir,
):
    torch.manual_seed(args.seed + rank)
    np.random.seed(args.seed + rank)
    print("set up Test process env")
    temp_dir = os.path.join(tensorboard_dir, "test_{}".format(p2))
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)
    writer = SummaryWriter(log_dir=temp_dir)

    # Main loop: collect experience in env and update/log each epoch
    if args.exp_name == "test":
        env = gym.make("CartPole-v0")
    elif p2 == "Non-station":
        env = make_ftg_ram_nonstation(args.env,
                                      p2_list=args.list,
                                      total_episode=args.test_episode,
                                      stable=args.stable)
    else:
        env = make_ftg_ram(args.env, p2=p2)
    print("TESTING process {} start to test, opp: {}".format(rank, p2))
    m_score, win_rate, steps = test_proc(global_ac, env, args, device)
    test_summary(p2, steps, m_score, win_rate, writer, args, e)
    env.close()
    del env
    print("TESTING process {} finished, opp: {}".format(rank, p2))
Example 3
scores = [[], [], [], []]
win_rates = [[], [], [], []]
rounds = [[], [], [], []]
for e in range(2):
    global_ac.load_state_dict(
        torch.load(os.path.join(experiment_dir, model_para[e])))
    global_ac.share_memory()
    for index, p2 in enumerate(p2_list):
        if args.exp_name == "test":
            env = gym.make("CartPole-v0")
        elif p2 == "Non-station":
            env = make_ftg_ram_nonstation(args.env,
                                          p2_list=args.list,
                                          total_episode=args.test_episode,
                                          stable=args.stable)
        else:
            env = make_ftg_ram(args.env, p2=p2)
        m_score, win_rate, steps = test_proc(global_ac, env, args,
                                             torch.device("cpu"))
        scores[index].append(m_score)
        win_rates[index].append(win_rate)
        rounds[index].append(e * 100)
        env.close()
        del env
    print("Round {} finished".format(e))
    torch.save((
        scores,
        win_rates,
        rounds,
Example 4
def sac_opp(
    global_ac,
    global_ac_targ,
    global_cpc,
    rank,
    T,
    E,
    args,
    scores,
    wins,
    buffer,
    device=None,
    tensorboard_dir=None,
):
    torch.manual_seed(args.seed + rank)
    np.random.seed(args.seed + rank)
    # writer = GlobalSummaryWriter.getSummaryWriter()
    tensorboard_dir = os.path.join(tensorboard_dir, str(rank))
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    writer = SummaryWriter(log_dir=tensorboard_dir)
    if args.exp_name == "test":
        env = gym.make("CartPole-v0")
    elif args.non_station:
        env = make_ftg_ram_nonstation(args.env,
                                      p2_list=args.list,
                                      total_episode=args.station_rounds,
                                      stable=args.stable)
    else:
        env = make_ftg_ram(args.env, p2=args.p2)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    print("set up child process env")
    local_ac = MLPActorCritic(obs_dim + args.c_dim, act_dim,
                              **dict(hidden_sizes=[args.hid] *
                                     args.l)).to(device)
    local_ac.load_state_dict(global_ac.state_dict())
    print("local ac load global ac")

    # Freeze target networks with respect to optimizers (only update via polyak averaging)
    # Async Version
    for p in global_ac_targ.parameters():
        p.requires_grad = False

    # Experience buffer
    if args.cpc:
        replay_buffer = ReplayBufferOppo(obs_dim=obs_dim,
                                         max_size=args.replay_size,
                                         encoder=global_cpc)
    else:
        replay_buffer = ReplayBuffer(obs_dim=obs_dim, size=args.replay_size)

    # Entropy Tuning
    target_entropy = -torch.prod(
        torch.Tensor(env.action_space.shape).to(
            device)).item()  # heuristic value from the paper
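    # note: for a Discrete action space env.action_space.shape is (), so the
    # product above is 1 and this continuous-control heuristic reduces to a
    # target entropy of -1.0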
    alpha = max(local_ac.log_alpha.exp().item(),
                args.min_alpha) if not args.fix_alpha else args.min_alpha

    # Set up optimizers for policy and q-function
    # Async Version
    pi_optimizer = Adam(global_ac.pi.parameters(), lr=args.lr, eps=1e-4)
    q1_optimizer = Adam(global_ac.q1.parameters(), lr=args.lr, eps=1e-4)
    q2_optimizer = Adam(global_ac.q2.parameters(), lr=args.lr, eps=1e-4)
    cpc_optimizer = Adam(global_cpc.parameters(), lr=args.lr, eps=1e-4)
    alpha_optim = Adam([global_ac.log_alpha], lr=args.lr, eps=1e-4)

    # Prepare for interaction with environment
    o, ep_ret, ep_len = env.reset(), 0, 0
    if args.cpc:
        c_hidden = global_cpc.init_hidden(1, args.c_dim, use_gpu=args.cuda)
        c1, c_hidden = global_cpc.predict(o, c_hidden)
        assert len(c1.shape) == 3
        c1 = c1.flatten().cpu().numpy()
        all_embeddings = []
        meta = []
    trajectory = list()
    p2 = env.p2
    p2_list = [str(p2)]
    discard = False
    uncertainties = []
    local_t, local_e = 0, 0
    t = T.value()
    e = E.value()
    glod_input = list()
    glod_target = list()
    # Main loop: collect experience in env and update/log each epoch
    while e <= args.episode:
        with torch.no_grad():
            # Until start_steps have elapsed, randomly sample actions
            # from a uniform distribution for better exploration. Afterwards,
            # use the learned policy.
            if t > args.start_steps:
                if args.cpc:
                    a = local_ac.get_action(np.concatenate((o, c1), axis=0),
                                            device=device)
                    a_prob = local_ac.act(
                        torch.as_tensor(np.expand_dims(np.concatenate((o, c1),
                                                                      axis=0),
                                                       axis=0),
                                        dtype=torch.float32,
                                        device=device))
                else:
                    a = local_ac.get_action(o, greedy=True, device=device)
                    a_prob = local_ac.act(
                        torch.as_tensor(np.expand_dims(o, axis=0),
                                        dtype=torch.float32,
                                        device=device))
            else:
                a = env.action_space.sample()
                a_prob = local_ac.act(
                    torch.as_tensor(np.expand_dims(o, axis=0),
                                    dtype=torch.float32,
                                    device=device))
        uncertainty = ood_scores(a_prob).item()

        # Step the env
        o2, r, d, info = env.step(a)
        if info.get('no_data_receive', False):
            discard = True
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if (ep_len == args.max_ep_len) or discard else d
        glod_input.append(o)
        glod_target.append(a)

        if args.cpc:
            # changed the trace structure for further analysis
            c2, c_hidden = global_cpc.predict(o2, c_hidden)
            assert len(c2.shape) == 3
            c2 = c2.flatten().cpu().numpy()
            replay_buffer.store(np.concatenate((o, c1), axis=0), a, r,
                                np.concatenate((o2, c2), axis=0), d)
            trajectory.append([o, a, r, o2, d, c1, c2, ep_len])
            all_embeddings.append(c1)
            meta.append([env.p2, local_e, ep_len, r, a, uncertainty])
            c1 = c2
        else:
            replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2
        T.increment()
        t = T.value()
        local_t += 1

        # End of trajectory handling
        if d or (ep_len == args.max_ep_len) or discard:
            replay_buffer.store(trajectory)
            E.increment()
            e = E.value()
            local_e += 1
            # logger.store(EpRet=ep_ret, EpLen=ep_len)
            if info.get('win', False):
                wins.append(1)
            else:
                wins.append(0)
            scores.append(ep_ret)
            m_score = np.mean(scores[-100:])
            win_rate = np.mean(wins[-100:])
            print(
                "Process {}, opponent:{}, # of global_episode :{},  # of global_steps :{}, round score: {}, mean score : {:.1f}, win_rate:{}, steps: {}, alpha: {}"
                .format(rank, args.p2, e, t, ep_ret, m_score, win_rate, ep_len,
                        alpha))
            writer.add_scalar("metrics/round_score", ep_ret, e)
            writer.add_scalar("metrics/mean_score", m_score.item(), e)
            writer.add_scalar("metrics/win_rate", win_rate.item(), e)
            writer.add_scalar("metrics/round_step", ep_len, e)
            writer.add_scalar("metrics/alpha", alpha, e)

            # CPC update handling
            if local_e > args.batch_size and local_e % args.update_every == 0 and args.cpc:
                data, indexes, min_len = replay_buffer.sample_traj(
                    args.batch_size)
                global_cpc.train()
                cpc_optimizer.zero_grad()
                c_hidden = global_cpc.init_hidden(len(data),
                                                  args.c_dim,
                                                  use_gpu=args.cuda)
                acc, loss, latents = global_cpc(data, c_hidden)

                replay_buffer.update_latent(indexes, min_len, latents.detach())
                loss.backward()
                # add gradient clipping
                nn.utils.clip_grad_norm_(global_cpc.parameters(), 20)
                cpc_optimizer.step()

                writer.add_scalar("training/acc", acc, e)
                writer.add_scalar("training/cpc_loss", loss.detach().item(), e)

                # keep all_embeddings as a Python list; rebinding it to an
                # ndarray here would break the later append() calls
                writer.add_embedding(mat=np.array(all_embeddings),
                                     metadata=meta,
                                     metadata_header=[
                                         "opponent", "round", "step", "reward",
                                         "action", "uncertainty"
                                     ])
                c_hidden = global_cpc.init_hidden(1,
                                                  args.c_dim,
                                                  use_gpu=args.cuda)
            o, ep_ret, ep_len = env.reset(), 0, 0
            trajectory = list()
            discard = False

        # OOD update stage
        if (t >= args.ood_update_step and local_t % args.ood_update_step == 0
                or replay_buffer.is_full()) and args.ood:
            # use all the data collected since the last OOD update as the training data
            print("Conduct OOD updating")
            ood_train = (glod_input, glod_target)
            glod_model = convert_to_glod(global_ac.pi,
                                         train_loader=ood_train,
                                         hidden_dim=args.hid,
                                         act_dim=act_dim,
                                         device=device)
            glod_scores = retrieve_scores(
                glod_model,
                replay_buffer.obs_buf[:replay_buffer.size],
                device=device,
                k=args.ood_K)
            glod_scores = glod_scores.detach().cpu().numpy()
            print(len(glod_scores))
            writer.add_histogram(values=glod_scores,
                                 max_bins=300,
                                 global_step=local_t,
                                 tag="OOD")
            drop_points = np.percentile(
                a=glod_scores, q=[args.ood_drop_lower, args.ood_drop_upper])
            lower, upper = drop_points[0], drop_points[1]
            print(lower, upper)
            mask = np.logical_and((glod_scores >= lower),
                                  (glod_scores <= upper))
            reserved_indexes = np.argwhere(mask).flatten()
            print(len(reserved_indexes))
            if len(reserved_indexes) > 0:
                replay_buffer.ood_drop(reserved_indexes)
                glod_input = list()
                glod_target = list()

        # SAC Update handling
        if local_t >= args.update_after and local_t % args.update_every == 0:
            for j in range(args.update_every):

                batch = replay_buffer.sample_trans(batch_size=args.batch_size,
                                                   device=device)
                # First run one gradient descent step for Q1 and Q2
                q1_optimizer.zero_grad()
                q2_optimizer.zero_grad()
                loss_q = local_ac.compute_loss_q(batch, global_ac_targ,
                                                 args.gamma, alpha)
                loss_q.backward()

                # Next run one gradient descent step for pi.
                pi_optimizer.zero_grad()
                loss_pi, entropy = local_ac.compute_loss_pi(batch, alpha)
                loss_pi.backward()

                alpha_optim.zero_grad()
                alpha_loss = -(local_ac.log_alpha *
                               (entropy + target_entropy).detach()).mean()
                alpha_loss.backward()
                alpha = max(
                    local_ac.log_alpha.exp().item(),
                    args.min_alpha) if not args.fix_alpha else args.min_alpha

                nn.utils.clip_grad_norm_(local_ac.parameters(), 20)
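                # A3C/Hogwild-style update: copy the locally computed gradients
                # onto the shared global parameters, step the optimizers built
                # on the global networks, then sync the local copy from the
                # updated global weights below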
                for global_param, local_param in zip(global_ac.parameters(),
                                                     local_ac.parameters()):
                    global_param._grad = local_param.grad

                pi_optimizer.step()
                q1_optimizer.step()
                q2_optimizer.step()
                alpha_optim.step()

                state_dict = global_ac.state_dict()
                local_ac.load_state_dict(state_dict)

                # Finally, update target networks by polyak averaging.
                with torch.no_grad():
                    for p, p_targ in zip(global_ac.parameters(),
                                         global_ac_targ.parameters()):
                        p_targ.data.copy_((1 - args.polyak) * p.data +
                                          args.polyak * p_targ.data)

                writer.add_scalar("training/pi_loss",
                                  loss_pi.detach().item(), t)
                writer.add_scalar("training/q_loss", loss_q.detach().item(), t)
                writer.add_scalar("training/alpha_loss",
                                  alpha_loss.detach().item(), t)
                writer.add_scalar("training/entropy",
                                  entropy.detach().mean().item(), t)

        if t % args.save_freq == 0 and t > 0:
            torch.save(
                global_ac.state_dict(),
                os.path.join(args.save_dir, args.exp_name, args.model_para))
            torch.save(
                global_cpc.state_dict(),
                os.path.join(args.save_dir, args.exp_name, args.cpc_para))
            state_dict_trans(
                global_ac.state_dict(),
                os.path.join(args.save_dir, args.exp_name, args.numpy_para))
            torch.save((e, t, list(scores), list(wins)),
                       os.path.join(args.save_dir, args.exp_name,
                                    args.train_indicator))
            print("Saving model at episode:{}".format(t))
Example 5
def sac(
    rank,
    E,
    args,
    model_q,
    buffer_q,
    device=None,
    tensorboard_dir=None,
):
    torch.manual_seed(args.seed + rank)
    np.random.seed(args.seed + rank)
    # writer = GlobalSummaryWriter.getSummaryWriter()
    tensorboard_dir = os.path.join(tensorboard_dir, str(rank))
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    writer = SummaryWriter(log_dir=tensorboard_dir)
    if args.exp_name == "test":
        env = gym.make("CartPole-v0")
    elif args.non_station:
        env = make_ftg_ram_nonstation(args.env,
                                      p2_list=args.list,
                                      total_episode=args.station_rounds,
                                      stable=args.stable)
    else:
        env = make_ftg_ram(args.env, p2=args.p2)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    ac_kwargs = dict(hidden_sizes=[args.hid] * args.l)
    local_ac = MLPActorCritic(obs_dim, act_dim, **ac_kwargs)
    print("set up child process env")

    # Prepare for interaction with environment
    scores, wins = [], []
    # meta data is purely for experiment analysis
    trajectory, meta = [], []
    o, ep_ret, ep_len = env.reset(), 0, 0
    discard = False
    local_t, local_e = 0, 0
    if not model_q.empty():
        print("Process {}\t Initially LOADING...".format(rank))
        received_obj = model_q.get()
        model_dict = deepcopy(received_obj)
        local_ac.load_state_dict(model_dict)
        print("Process {}\t Initially Loading FINISHED!!!".format(rank))
        del received_obj
    # Main loop: collect experience in env and update/log each epoch
    while E.value() <= args.episode:
        with torch.no_grad():
            if E.value() <= args.update_after:
                a = np.random.randint(act_dim)
            else:
                a = local_ac.get_action(o, device=device)

        # print(o)
        # Step the env
        o2, r, d, info = env.step(a)
        if info.get('no_data_receive', False):
            discard = True
        ep_ret += r
        ep_len += 1

        d = False if (ep_len == args.max_ep_len) or discard else d
        # send the transition to main process
        if hasattr(env, 'p2'):
            opp = env.p2
        else:
            opp = None
        transition = (o, a, r, o2, d)
        trajectory.append(transition)
        meta.append([opp, rank, local_e, ep_len, r, a])
        o = o2
        local_t += 1
        # End of trajectory handling
        if d or (ep_len == args.max_ep_len) or discard:
            e = E.value()
            send_data = (trajectory, meta)
            buffer_q.put(send_data)
            local_e += 1
            # logger.store(EpRet=ep_ret, EpLen=ep_len)
            if info.get('win', False):
                wins.append(1)
            else:
                wins.append(0)
            scores.append(ep_ret)
            m_score = np.mean(scores[-100:])
            win_rate = np.mean(wins[-100:])
            print(
                "Process\t{}\topponent:{},\t# of local episode :{},\tglobal episode {}\tround score: {},\tmean score : {:.1f},\twin rate:{},\tsteps: {}"
                .format(rank, opp, local_e, e, ep_ret, m_score, win_rate,
                        ep_len))
            writer.add_scalar("actor/round_score", ep_ret, local_e)
            writer.add_scalar("actor/mean_score", m_score.item(), local_e)
            writer.add_scalar("actor/win_rate", win_rate.item(), local_e)
            writer.add_scalar("actor/round_step", ep_len, local_e)
            writer.add_scalar("actor/learner_actor_speed", e, local_e)
            o, ep_ret, ep_len = env.reset(), 0, 0
            discard = False
            trajectory, meta = list(), list()
            if not model_q.empty():
                print(
                    "Process {}\tLOADING model at Global\t{},local\t{} EPISODE..."
                    .format(rank, e, local_e))
                received_obj = model_q.get()
                model_dict = deepcopy(received_obj)
                local_ac.load_state_dict(model_dict)
                print("Process {}\tLOADED new mode at Global\t{},local\t{}!!!".
                      format(rank, e, local_e))
                del received_obj
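Examples 4 and 5 assume a shared counter object exposing value()/increment() (E, T) and torch.multiprocessing queues (model_q carrying state_dicts to the actors, buffer_q carrying trajectories back to the learner). The project's own counter class and launcher are not shown here; the sketch below illustrates that assumed pattern, and every learner-side name in it is hypothetical.

import torch
import torch.multiprocessing as mp

class Counter:
    # assumed interface used by the examples above: value() / increment()
    def __init__(self):
        self._val = mp.Value("i", 0)

    def increment(self):
        with self._val.get_lock():
            self._val.value += 1

    def value(self):
        with self._val.get_lock():
            return self._val.value

if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    E = Counter()
    model_q, buffer_q = mp.Queue(), mp.Queue()
    # args and tensorboard_dir are assumed to come from the training script;
    # four actor processes are spawned here purely for illustration
    workers = [
        mp.Process(target=sac,
                   args=(rank, E, args, model_q, buffer_q,
                         torch.device("cpu"), tensorboard_dir))
        for rank in range(4)
    ]
    for w in workers:
        w.start()
    # learner loop (not shown): drain buffer_q into a replay buffer, run SAC
    # updates, push fresh state_dicts onto model_q, and call E.increment()
    for w in workers:
        w.join()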