def main():
    max_iteration = 5000
    episodes_per_batch = 20
    max_kl = 0.01
    init_logvar = -1
    policy_epochs = 5
    value_epochs = 10
    value_batch_size = 256
    gamma = 0.995
    lam = .97

    # initialize environment
    env = HumanoidEnv()
    env.seed(0)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    logger = Logger()

    # init qpos and qvel
    init_qpos = np.load('./mocap_expert_qpos.npy')
    init_qvel = np.load('./mocap_expert_qvel.npy')

    # policy function
    policy = Policy(obs_dim=obs_dim, act_dim=act_dim, max_kl=max_kl,
                    init_logvar=init_logvar, epochs=policy_epochs, logger=logger)

    session_to_restore = '/Users/sayvaz/Desktop/humanoid_gail_results/model_ego_inter/model_humanoid_ego_1700'
    stats_to_recover = '/Users/sayvaz/Desktop/humanoid_gail_results/model_ego_inter/stats_humanoid_ego_1700'
    scale, offset = policy.restore_session(session_to_restore=session_to_restore,
                                           stats_to_recover=stats_to_recover)

    # expert agent
    agent = ExpertAgent(env=env, policy_function=policy, scale=scale, offset=offset,
                        init_qpos=init_qpos, init_qvel=init_qvel, logger=logger)
    agent.collect(episodes_per_batch=episodes_per_batch)

    # close everything
    policy.close_session()
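# Not shown in the original excerpt: if this collection script is run directly,
# a standard entry-point guard (assumed here) would invoke main().
if __name__ == '__main__':
    main()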
    def _get_obs(self):
        obs = np.concatenate([
            HumanoidEnv._get_obs(self),
            np.zeros(self.n_bins)  # goal_readings
        ])
        return obs
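# A minimal sketch of the environment subclass the override above could live in;
# the class name and the default n_bins value are assumptions, not from the original.
# n_bins must be assigned before HumanoidEnv.__init__ runs, because the base
# constructor calls _get_obs() to size the observation space.
class GoalHumanoidEnv(HumanoidEnv):
    def __init__(self, n_bins=10):
        self.n_bins = n_bins  # number of (zeroed) goal-reading slots appended to the observation
        super().__init__()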
def main():
    max_iteration = 3000
    episodes_per_batch = 20
    max_kl = 0.01
    init_logvar = -1
    policy_epochs = 5
    value_epochs = 10
    value_batch_size = 256
    gamma = 0.995
    lam = .97
    exp_info = 'humanoid_ego_pure'

    # initialize environment
    env = HumanoidEnv()
    env.seed(0)
    obs_dim = env.observation_space.shape[0]
    ego_dim = env.ego_pure_shape()
    print('obs_dim: ', obs_dim)
    print('ego_dim: ', ego_dim)
    act_dim = env.action_space.shape[0]
    logger = Logger()
    killer = GracefulKiller()

    # init qpos/qvel and expert observations from the mocap data
    init_qpos = np.load('./mocap_expert_qpos.npy')
    init_qvel = np.load('./mocap_expert_qvel.npy')
    exp_obs = np.load('./mocap_pure_ego.npy')
    print('exp_obs shape: ', exp_obs.shape)

    # policy function
    policy = Policy(obs_dim=obs_dim, act_dim=act_dim, max_kl=max_kl,
                    init_logvar=init_logvar, epochs=policy_epochs, logger=logger)

    # value function
    value = Value(obs_dim=obs_dim, act_dim=act_dim, epochs=value_epochs,
                  batch_size=value_batch_size, logger=logger)

    # discriminator on the pure egocentric observations
    discriminator = Discriminator(obs_dim=ego_dim, act_dim=act_dim, ent_reg_weight=1e-3,
                                  epochs=2, input_type='states', loss_type='pure_gail',
                                  logger=logger)

    # agent
    agent = GeneratorAgentEgoPure(env=env, policy_function=policy, value_function=value,
                                  discriminator=discriminator, gamma=gamma, lam=lam,
                                  init_qpos=init_qpos, init_qvel=init_qvel, logger=logger)

    print('policy lr: %f' % policy.lr)
    print('value lr: %f' % value.lr)
    print('disc lr: %f' % discriminator.lr)

    # train for max_iteration iterations
    iteration = 0
    while iteration < max_iteration:
        print('-------- iteration %d --------' % iteration)

        # collect trajectories
        obs, uns_obs, acts, tdlams, advs = agent.collect(timesteps=20000)

        # update policy function using ppo
        policy.update(obs, acts, advs)

        # update value function
        value.update(obs, tdlams)

        # update the discriminator on a random batch of expert observations
        idx = np.random.randint(low=0, high=exp_obs.shape[0], size=uns_obs.shape[0])
        expert = exp_obs[idx, :]
        gen_acc, exp_acc, total_acc = discriminator.update(exp_obs=expert, gen_obs=uns_obs)
        print('gen_acc: %f, exp_acc: %f, total_acc: %f' % (gen_acc, exp_acc, total_acc))

        if iteration % 50 == 0:
            print('saving...')
            # save the experiment logs
            filename = './model_inter_ego_pure/stats_' + exp_info + '_' + str(iteration)
            logger.dump(filename)
            # save session
            filename = './model_inter_ego_pure/model_' + exp_info + '_' + str(iteration)
            policy.save_session(filename)

        if killer.kill_now:
            break

        # update iteration number
        iteration += 1

    # save the experiment logs
    filename = './model_ego_pure/stats_' + exp_info
    logger.dump(filename)

    # save session
    filename = './model_ego_pure/model_' + exp_info
    policy.save_session(filename)

    # close everything
    policy.close_session()
    value.close_session()
    env.close()
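# A hedged sketch of the GracefulKiller helper the training loop above relies on
# (the actual implementation lives elsewhere in the repo): it flips kill_now when
# SIGINT or SIGTERM arrives, so the loop can exit cleanly after finishing the
# current iteration and the final save still runs.
import signal

class GracefulKiller:
    kill_now = False

    def __init__(self):
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        self.kill_now = True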
import tensorflow as tf
import numpy as np
import gym
import multiprocessing as mp
from policy import Policy
from value import Value
from agent import ExpertAgent
from discriminator import Discriminator
from utils import Logger
import argparse
import signal
from gym.envs.mujoco.humanoid import HumanoidEnv
from sklearn.utils import shuffle

env = HumanoidEnv()
print(env.observation_space.shape[0])
print(env.action_space.shape[0])

# load the mocap expert states
qpos = np.load('./mocap_expert_qpos.npy')
qvel = np.load('./mocap_expert_qvel.npy')
print(qpos.shape[1])

# replay each mocap frame and record the pure egocentric observation
obs = []
for i in range(qpos.shape[1]):
    env.set_state(qpos[:, i], qvel[:, i])
    obs.append(env.get_pure_egocentric())

obs = np.array(obs)
print(obs.shape)
np.save('mocap_pure_ego', obs)