class Agent:
    def __init__(self, world_size, args):
        if args.env_name == 'L2M2019Env':
            env = L2M2019Env(visualize=False, difficulty=args.difficulty)
            obs_dim = 99
        else:
            env = gym.make(args.env_name)
            obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]

        self.device = torch.device(args.device)
        self.args = args
        self.world_size = world_size
        self.actor_critic = MLPActorCritic(
            obs_dim, act_dim, hidden_sizes=args.hidden_sizes).to(self.device)
        # One replay buffer per observer process (ranks 1 .. world_size - 1).
        self.replay_buffer = [
            ReplayBuffer(obs_dim, act_dim, args.buffer_size)
            for _ in range(1, world_size)
        ]
        self.gac = GAC(self.actor_critic,
                       self.replay_buffer,
                       device=self.device,
                       gamma=args.gamma,
                       alpha_start=args.alpha_start,
                       alpha_min=args.alpha_min,
                       alpha_max=args.alpha_max)
        self.test_len = 0.0
        self.test_ret = 0.0

        # Remote references to the observer workers.
        self.ob_rrefs = []
        for ob_rank in range(1, world_size):
            ob_info = rpc.get_worker_info(OBSERVER_NAME.format(ob_rank))
            self.ob_rrefs.append(remote(ob_info, Observer, args=(args, )))
        self.agent_rref = RRef(self)

    def select_action(self, obs, deterministic=False):
        obs = torch.FloatTensor(obs.reshape(1, -1)).to(self.device)
        a = self.actor_critic.act(obs, deterministic)
        return a

    def add_memory(self, ob_id, o, a, r, o2, d):
        # Observer ranks start at 1; buffer indices start at 0.
        self.replay_buffer[ob_id - 1].store(o, a, r, o2, d)

    def run_episode(self, n_steps=0, random=False):
        futs = []
        for ob_rref in self.ob_rrefs:
            # Make an async RPC to kick off an episode on every observer.
            futs.append(
                rpc_async(ob_rref.owner(),
                          _call_method,
                          args=(Observer.run_episode, ob_rref, self.agent_rref,
                                n_steps, random)))
        # Wait until all observers have finished this episode.
        for fut in futs:
            fut.wait()

    def add_test_data(self, ret, length):
        self.test_ret += ret
        self.test_len += length

    def test_episode(self):
        futs, self.test_ret, self.test_len = [], 0.0, 0.0
        for ob_rref in self.ob_rrefs:
            # Make an async RPC to kick off a test episode on every observer.
            futs.append(
                rpc_async(ob_rref.owner(),
                          _call_method,
                          args=(Observer.test_episode, ob_rref, self.agent_rref)))
        # Wait until all observers have finished this episode.
        for fut in futs:
            fut.wait()
        # Average the returns reported back through add_test_data().
        self.test_ret /= (self.world_size - 1)
        self.test_len /= (self.world_size - 1)
        return self.test_ret, self.test_len

    def update(self):
        for _ in range(self.args.steps_per_update):
            loss_a, loss_c, alpha = self.gac.update(self.args.batch_size)
            self.gac.update_beta()
            print(
                "loss_actor = {:<22}, loss_critic = {:<22}, alpha = {:<20}, beta = {:<20}"
                .format(loss_a, loss_c, alpha, self.gac.beta))
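# --- Sketch (not part of the original module) --------------------------------
# Agent relies on two pieces that live elsewhere in the repository: the
# `_call_method` RPC helper and a launcher that spawns one agent process
# (rank 0) plus world_size - 1 observer processes named via OBSERVER_NAME.
# The version below follows the standard torch.distributed.rpc pattern; the
# worker name 'agent' and the training schedule inside run_worker are
# illustrative assumptions, not taken from the original source.
import torch.distributed.rpc as rpc
import torch.multiprocessing as mp


def _call_method(method, rref, *args, **kwargs):
    # Invoke `method` on the object that `rref` owns locally.
    return method(rref.local_value(), *args, **kwargs)


def run_worker(rank, world_size, args):
    if rank == 0:
        rpc.init_rpc('agent', rank=rank, world_size=world_size)
        agent = Agent(world_size, args)
        # Warm up with random actions, then alternate collection and updates.
        agent.run_episode(n_steps=args.start_steps, random=True)
        for _ in range(args.total_epoch):
            agent.run_episode(n_steps=args.steps_per_epoch)
            agent.update()
            test_ret, test_len = agent.test_episode()
            print("test_ret = {}, test_len = {}".format(test_ret, test_len))
    else:
        # Observers only serve RPCs issued by the agent.
        rpc.init_rpc(OBSERVER_NAME.format(rank), rank=rank, world_size=world_size)
    rpc.shutdown()  # Blocks until every worker is done.


# Launch example: mp.spawn(run_worker, args=(world_size, args), nprocs=world_size)
# ------------------------------------------------------------------------------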
def main(args):
    if 'L2M2019Env' in args.env_name:
        env = L2M2019Env(visualize=False, difficulty=args.difficulty)
        test_env = L2M2019Env(visualize=False, difficulty=args.difficulty)
    else:
        env = gym.make(args.env_name)
        test_env = gym.make(args.env_name)
    device = torch.device(args.device)

    # Observation normalization statistics shipped with the challenge code.
    data = np.load('./official_obs_scaler.npz')
    obs_mean, obs_std = data['mean'], data['std']

    # 1. Set the necessary seeds.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)
    test_env.seed(args.seed + 999)

    # 2. Create the actor-critic, the replay buffer and the GAC trainer.
    if 'L2M2019Env' in args.env_name:
        obs_dim = 99
    else:
        obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_high = env.action_space.high
    act_low = env.action_space.low

    actor_critic = MLPActorCritic(
        obs_dim, act_dim, hidden_sizes=args.hidden_sizes).to(device)
    replay_buffer = ReplayBuffer(obs_dim, act_dim, args.buffer_size)
    gac = GAC(actor_critic,
              replay_buffer,
              device=device,
              gamma=args.gamma,
              alpha_start=args.alpha_start,
              alpha_min=args.alpha_min,
              alpha_max=args.alpha_max)

    def act_encoder(y):
        # y in [act_low, act_high] ==> x in [-1, 1]
        # if args.env_name == 'L2M2019Env':
        #     return y
        return (y - act_low) / (act_high - act_low) * 2.0 - 1.0

    def act_decoder(x):
        # x in [-1, 1] ==> y in [act_low, act_high]
        # if args.env_name == 'L2M2019Env':
        #     return np.abs(x)
        return (x + 1.0) / 2.0 * (act_high - act_low) + act_low

    def get_observation(env):
        # Normalized muscle/body observation plus the local target-velocity field.
        obs = np.array(env.get_observation()[242:])
        obs = (obs - obs_mean) / obs_std
        state_desc = env.get_state_desc()
        p_body = [
            state_desc['body_pos']['pelvis'][0],
            -state_desc['body_pos']['pelvis'][2]
        ]
        v_body = [
            state_desc['body_vel']['pelvis'][0],
            -state_desc['body_vel']['pelvis'][2]
        ]
        v_tgt = env.vtgt.get_vtgt(p_body).T
        return np.append(obs, v_tgt)

    def get_reward(env):
        reward = 10.0  # Reward for not falling down.
        state_desc = env.get_state_desc()
        p_body = [
            state_desc['body_pos']['pelvis'][0],
            -state_desc['body_pos']['pelvis'][2]
        ]
        v_body = [
            state_desc['body_vel']['pelvis'][0],
            -state_desc['body_vel']['pelvis'][2]
        ]
        v_tgt = env.vtgt.get_vtgt(p_body).T
        # Penalize deviation from the target velocity and muscle activations.
        vel_penalty = np.linalg.norm(v_body - v_tgt)
        muscle_penalty = 0
        for muscle in sorted(state_desc['muscles'].keys()):
            muscle_penalty += np.square(
                state_desc['muscles'][muscle]['activation'])
        ret_r = reward - (vel_penalty * 3 + muscle_penalty * 1)
        if vel_penalty < 0.3:
            ret_r += 10  # Bonus for closely tracking the target velocity.
        return ret_r
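    # Sanity check (sketch, not in the original code): act_encoder and
    # act_decoder should be inverse maps between the env's action box and
    # [-1, 1]. For L2M2019Env act_low = 0 and act_high = 1, so decoding simply
    # rescales the policy output from [-1, 1] back to [0, 1].
    _probe = env.action_space.sample()
    assert np.allclose(act_decoder(act_encoder(_probe)), _probe, atol=1e-6)
    assert np.all(np.abs(act_encoder(_probe)) <= 1.0 + 1e-6)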
    # 3. Start training.
    def get_action(o, deterministic=False):
        o = torch.FloatTensor(o.reshape(1, -1)).to(device)
        a = actor_critic.act(o, deterministic)
        return a

    def test_agent():
        test_ret, test_len = 0, 0
        for j in range(args.epoch_per_test):
            _, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            o = get_observation(test_env)
            while not (d or (ep_len == args.max_ep_len)):
                # Take deterministic actions at test time.
                a = get_action(o, True)
                a = act_decoder(a)
                for _ in range(args.frame_skip):
                    _, r, d, _ = test_env.step(a)
                    ep_ret += r
                    ep_len += 1
                    if d:
                        break
                o = get_observation(test_env)
            test_ret += ep_ret
            test_len += ep_len
        return test_ret / args.epoch_per_test, test_len / args.epoch_per_test

    total_step = args.total_epoch * args.steps_per_epoch
    _, d, ep_len = env.reset(), False, 0
    o = get_observation(env)
    for t in range(1, total_step + 1):
        if t <= args.start_steps:
            # Warm-up: use uniformly sampled actions, mapped into [-1, 1].
            a = act_encoder(env.action_space.sample())
        else:
            a = get_action(o, deterministic=False)
        a = act_decoder(a)

        r = 0.0
        for _ in range(args.frame_skip):
            _, _, d, _ = env.step(a)
            r += get_reward(env)
            ep_len += 1
            if d:
                break
        o2 = get_observation(env)

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state).
        d = False if ep_len == args.max_ep_len else d

        # if not d:
        #     new_o, new_r, new_o2 = generate_success(o, o2)
        #     replay_buffer.store(new_o, a, new_r * args.reward_scale, new_o2, d)

        # Store experience to the replay buffer.
        replay_buffer.store(o, a, r * args.reward_scale, o2, d)
        o = o2

        if d or (ep_len == args.max_ep_len):
            # obs_as_dict=False is an L2M2019Env-specific reset() keyword.
            _, ep_len = env.reset(obs_as_dict=False), 0
            o = get_observation(env)

        if t >= args.update_after and t % args.steps_per_update == 0:
            for _ in range(args.steps_per_update):
                loss_a, loss_c, alpha = gac.update(args.batch_size)
                gac.update_beta()
                print(
                    "loss_actor = {:<22}, loss_critic = {:<22}, alpha = {:<20}, beta = {:<20}"
                    .format(loss_a, loss_c, alpha, gac.beta))

        # End-of-epoch handling: evaluate the current policy and yield it.
        if t >= args.update_after and t % args.steps_per_epoch == 0:
            test_ret, test_len = test_agent()
            print("Step {:>10}: test_ret = {:<20}, test_len = {:<20}".format(
                t, test_ret, test_len))
            print(
                "-----------------------------------------------------------")
            yield t, test_ret, test_len, actor_critic
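# --- Usage sketch (not part of the original module) ---------------------------
# main(args) is a generator: it yields (step, test_ret, test_len, actor_critic)
# at the end of every test epoch, leaving logging and checkpointing to the
# caller. The helper below is one way to drive it; its name and the default
# save path are illustrative assumptions, not taken from the original source.
def train_and_checkpoint(args, save_path='best_actor_critic.pt'):
    best_ret = -float('inf')
    for t, test_ret, test_len, actor_critic in main(args):
        if test_ret > best_ret:
            best_ret = test_ret
            # Keep only the best-performing policy seen so far.
            torch.save(actor_critic.state_dict(), save_path)
    return best_ret
# -------------------------------------------------------------------------------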