Beispiel #1
0
class Agent:
    """Central learner that owns the model and drives remote observers.

    One ``Observer`` is instantiated remotely for every worker rank in
    ``1 .. world_size - 1``. Each observer gets its own replay buffer, and
    the list of buffers is handed to a single ``GAC`` updater together with
    the shared actor-critic network.
    """

    def __init__(self, world_size, args):
        """Build the network, buffers, updater and remote observers.

        Args:
            world_size: Total number of RPC workers; ranks ``1`` to
                ``world_size - 1`` host observers.
            args: Configuration object. Attributes read here are
                ``env_name``, ``difficulty`` (L2M2019Env only), ``device``,
                ``hidden_sizes``, ``buffer_size``, ``gamma``,
                ``alpha_start``, ``alpha_min`` and ``alpha_max``.
        """
        # A local environment is instantiated only to read the observation
        # and action dimensions; it is not stored on the agent.
        if args.env_name == 'L2M2019Env':
            probe_env = L2M2019Env(visualize=False,
                                   difficulty=args.difficulty)
            # Hard-coded observation size for the L2M2019 environment.
            n_obs = 99
        else:
            probe_env = gym.make(args.env_name)
            n_obs = probe_env.observation_space.shape[0]
        n_act = probe_env.action_space.shape[0]

        self.device = torch.device(args.device)
        self.args = args
        self.world_size = world_size

        network = MLPActorCritic(n_obs, n_act, hidden_sizes=args.hidden_sizes)
        self.actor_critic = network.to(self.device)

        # One buffer per observer; observer rank k writes to index k - 1.
        n_observers = world_size - 1
        self.replay_buffer = [
            ReplayBuffer(n_obs, n_act, args.buffer_size)
            for _ in range(n_observers)
        ]

        self.gac = GAC(
            self.actor_critic,
            self.replay_buffer,
            device=self.device,
            gamma=args.gamma,
            alpha_start=args.alpha_start,
            alpha_min=args.alpha_min,
            alpha_max=args.alpha_max,
        )

        # Accumulators filled by observers through ``add_test_data``.
        self.test_len = 0.0
        self.test_ret = 0.0

        # Create one remote Observer on each observer worker.
        self.ob_rrefs = [
            remote(rpc.get_worker_info(OBSERVER_NAME.format(rank)),
                   Observer,
                   args=(args, ))
            for rank in range(1, world_size)
        ]

        # Reference to this agent that is passed to the observers.
        self.agent_rref = RRef(self)

    def select_action(self, obs, deterministic=False):
        """Return the actor-critic's action for a single observation.

        ``obs`` is reshaped to one row, converted to a float tensor on the
        agent's device and passed to ``actor_critic.act``.
        """
        batch = torch.FloatTensor(obs.reshape(1, -1)).to(self.device)
        return self.actor_critic.act(batch, deterministic)

    def add_memory(self, ob_id, o, a, r, o2, d):
        """Store one transition in the buffer belonging to observer ``ob_id``.

        ``ob_id`` is 1-based, matching the observer rank.
        """
        buffer = self.replay_buffer[ob_id - 1]
        buffer.store(o, a, r, o2, d)

    def run_episode(self, n_steps=0, random=False):
        """Start ``Observer.run_episode`` on every observer and wait for all.

        ``n_steps`` and ``random`` are forwarded unchanged to the observers.
        """
        # Launch all episodes asynchronously first, then block on each one.
        pending = [
            rpc_async(ob_rref.owner(),
                      _call_method,
                      args=(Observer.run_episode, ob_rref, self.agent_rref,
                            n_steps, random))
            for ob_rref in self.ob_rrefs
        ]
        for fut in pending:
            fut.wait()

    def add_test_data(self, ret, length):
        """Add one test episode's return and length to the running totals."""
        self.test_ret = self.test_ret + ret
        self.test_len = self.test_len + length

    def test_episode(self):
        """Run a test episode on every observer and return the averages.

        Returns:
            tuple: ``(test_ret, test_len)``, the accumulated totals divided
            by the number of observers (``world_size - 1``).
        """
        # Reset the accumulators before the observers report back.
        self.test_ret = 0.0
        self.test_len = 0.0

        pending = [
            rpc_async(ob_rref.owner(),
                      _call_method,
                      args=(Observer.test_episode, ob_rref,
                            self.agent_rref))
            for ob_rref in self.ob_rrefs
        ]
        for fut in pending:
            fut.wait()

        n_observers = self.world_size - 1
        self.test_ret /= n_observers
        self.test_len /= n_observers
        return self.test_ret, self.test_len

    def update(self):
        """Run ``args.steps_per_update`` GAC updates, then refresh beta.

        Prints the losses and alpha returned by the last update step together
        with the updater's current ``beta``.
        """
        batch_size = self.args.batch_size
        for _ in range(self.args.steps_per_update):
            actor_loss, critic_loss, alpha = self.gac.update(batch_size)
        self.gac.update_beta()

        template = "loss_actor = {:<22}, loss_critic = {:<22}, alpha = {:<20}, beta = {:<20}"
        print(template.format(actor_loss, critic_loss, alpha, self.gac.beta))
Beispiel #2
0
def main(args):
    """Train a GAC agent in a single process, yielding after each evaluation.

    This is a generator. It steps the training environment for
    ``args.total_epoch * args.steps_per_epoch`` decision steps, stores every
    transition in a replay buffer, periodically runs GAC updates, and at the
    end of each epoch (once ``t >= args.update_after``) evaluates the policy
    on a separate test environment.

    Args:
        args: Configuration object. Attributes read: ``env_name``,
            ``difficulty`` (L2M2019Env only), ``device``, ``seed``,
            ``hidden_sizes``, ``buffer_size``, ``gamma``, ``alpha_start``,
            ``alpha_min``, ``alpha_max``, ``epoch_per_test``, ``max_ep_len``,
            ``frame_skip``, ``total_epoch``, ``steps_per_epoch``,
            ``start_steps``, ``reward_scale``, ``update_after``,
            ``steps_per_update`` and ``batch_size``.

    Yields:
        tuple: ``(t, test_ret, test_len, actor_critic)`` where ``t`` is the
        current step, ``test_ret`` / ``test_len`` are the mean return and
        length over ``args.epoch_per_test`` test episodes, and
        ``actor_critic`` is the model being trained.

    Note:
        The observation and reward helpers call ``get_observation``,
        ``get_state_desc`` and ``vtgt`` on the environment, and the
        observation scaler is read from ``./official_obs_scaler.npz``.
    """
    if 'L2M2019Env' in args.env_name:
        env = L2M2019Env(visualize=False, difficulty=args.difficulty)
        test_env = L2M2019Env(visualize=False, difficulty=args.difficulty)
    else:
        env = gym.make(args.env_name)
        test_env = gym.make(args.env_name)
    device = torch.device(args.device)

    # Close the archive as soon as the two arrays have been read out of it.
    with np.load('./official_obs_scaler.npz') as data:
        obs_mean, obs_std = data['mean'], data['std']

    # 1. Seed every source of randomness; the test env gets a distinct seed.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)
    test_env.seed(args.seed + 999)

    # 2. Create the actor-critic network, the replay buffer and the updater.
    if 'L2M2019Env' in args.env_name:
        # Hard-coded observation size for the L2M2019 environment.
        obs_dim = 99
    else:
        obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    act_high = env.action_space.high
    act_low = env.action_space.low

    actor_critic = MLPActorCritic(obs_dim,
                                  act_dim,
                                  hidden_sizes=args.hidden_sizes).to(device)

    replay_buffer = ReplayBuffer(obs_dim, act_dim, args.buffer_size)

    gac = GAC(actor_critic,
              replay_buffer,
              device=device,
              gamma=args.gamma,
              alpha_start=args.alpha_start,
              alpha_min=args.alpha_min,
              alpha_max=args.alpha_max)

    def act_encoder(y):
        """Map an environment action in [act_low, act_high] to [-1, 1]."""
        return (y - act_low) / (act_high - act_low) * 2.0 - 1.0

    def act_decoder(x):
        """Map a normalized action in [-1, 1] back to [act_low, act_high].

        Exact inverse of ``act_encoder``; the offset must be *added*.
        """
        return (x + 1.0) / 2.0 * (act_high - act_low) + act_low

    def pelvis_xz(vec_by_body):
        """Return ``[x, -z]`` of the pelvis entry of a per-body vector dict."""
        pelvis = vec_by_body['pelvis']
        return [pelvis[0], -pelvis[2]]

    def get_observation(env):
        """Build the policy observation from the environment's current state.

        Takes the raw observation from index 242 onwards, standardizes it
        with the loaded mean/std, and appends the target velocity looked up
        at the pelvis position.
        """
        # presumably the first 242 entries are the raw target-velocity field
        # that is replaced by the single local target below — TODO confirm
        obs = np.array(env.get_observation()[242:])
        obs = (obs - obs_mean) / obs_std

        state_desc = env.get_state_desc()
        p_body = pelvis_xz(state_desc['body_pos'])
        v_tgt = env.vtgt.get_vtgt(p_body).T

        return np.append(obs, v_tgt)

    def get_reward(env):
        """Compute the shaped training reward for the current state.

        Starts from a constant 10, subtracts 3x the distance between pelvis
        velocity and target velocity and 1x the sum of squared muscle
        activations, and adds a bonus of 10 when the velocity error is
        below 0.3.
        """
        reward = 10.0

        state_desc = env.get_state_desc()
        p_body = pelvis_xz(state_desc['body_pos'])
        v_body = pelvis_xz(state_desc['body_vel'])
        v_tgt = env.vtgt.get_vtgt(p_body).T

        vel_penalty = np.linalg.norm(v_body - v_tgt)

        muscles = state_desc['muscles']
        muscle_penalty = sum(
            np.square(muscles[name]['activation']) for name in sorted(muscles))

        ret_r = reward - (vel_penalty * 3 + muscle_penalty * 1)

        if vel_penalty < 0.3:
            ret_r += 10

        return ret_r

    # 3. Start training.
    def get_action(o, deterministic=False):
        """Query the actor-critic for an action for a single observation."""
        o = torch.FloatTensor(o.reshape(1, -1)).to(device)
        return actor_critic.act(o, deterministic)

    def test_agent():
        """Run ``args.epoch_per_test`` test episodes; return mean return/length.

        Uses the reward returned by ``test_env.step`` rather than the shaped
        training reward.
        """
        test_ret, test_len = 0, 0
        for _ in range(args.epoch_per_test):
            _, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            o = get_observation(test_env)
            # ``>=`` because ep_len advances by up to frame_skip per action
            # and may step past max_ep_len without ever equalling it.
            while not (d or ep_len >= args.max_ep_len):
                # Take deterministic actions at test time
                a = act_decoder(get_action(o, True))

                for _ in range(args.frame_skip):
                    _, r, d, _ = test_env.step(a)
                    ep_ret += r
                    ep_len += 1
                    if d:
                        break

                o = get_observation(test_env)

            test_ret += ep_ret
            test_len += ep_len
        return test_ret / args.epoch_per_test, test_len / args.epoch_per_test

    total_step = args.total_epoch * args.steps_per_epoch
    _, d, ep_len = env.reset(), False, 0
    o = get_observation(env)
    for t in range(1, total_step + 1):
        # Sample uniformly random actions during warm-up, then use the policy.
        if t <= args.start_steps:
            a = act_encoder(env.action_space.sample())
        else:
            a = get_action(o, deterministic=False)

        # NOTE(review): the decoded (environment-space) action is what gets
        # stored in the replay buffer below — confirm this is what GAC expects.
        a = act_decoder(a)

        # Repeat the action for frame_skip env steps, summing shaped rewards.
        r = 0.0
        for _ in range(args.frame_skip):
            _, _, d, _ = env.step(a)
            r += get_reward(env)
            ep_len += 1
            if d:
                break

        o2 = get_observation(env)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        timed_out = ep_len >= args.max_ep_len
        d = False if timed_out else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r * args.reward_scale, o2, d)

        o = o2
        if d or timed_out:
            _, ep_len = env.reset(obs_as_dict=False), 0
            o = get_observation(env)

        if t >= args.update_after and t % args.steps_per_update == 0:
            for _ in range(args.steps_per_update):
                loss_a, loss_c, alpha = gac.update(args.batch_size)
            gac.update_beta()
            print(
                "loss_actor = {:<22}, loss_critic = {:<22}, alpha = {:<20}, beta = {:<20}"
                .format(loss_a, loss_c, alpha, gac.beta))

        # End of epoch handling
        if t >= args.update_after and t % args.steps_per_epoch == 0:
            test_ret, test_len = test_agent()
            print("Step {:>10}: test_ret = {:<20}, test_len = {:<20}".format(
                t, test_ret, test_len))
            print(
                "-----------------------------------------------------------")
            yield t, test_ret, test_len, actor_critic