def _set_env_vars(self):
    env = self.make_env(0)
    self.ob_space, self.ac_space = env.observation_space, env.action_space
    self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
    del env
    self.envs = [partial(self.make_env, i) for i in range(self.envs_per_process)]
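# Every variant below delegates observation normalization to
# random_agent_ob_mean_std. A minimal sketch of what such a helper can look
# like, assuming a classic Gym step API; the name
# random_agent_ob_mean_std_sketch and its exact return conventions are
# illustrative, not this codebase's implementation (some variants below pass
# extra arguments such as load= or force_reset= that this sketch omits).
import numpy as np

def random_agent_ob_mean_std_sketch(env, nsteps=10000):
    """Roll a uniformly random policy for nsteps and return observation stats:
    a per-pixel mean (same shape as one observation) and a scalar std."""
    obs = [np.asarray(env.reset())]
    for _ in range(nsteps):
        ob, _, done, _ = env.step(env.action_space.sample())
        obs.append(np.asarray(ob))
        if done:
            obs.append(np.asarray(env.reset()))
    obs = np.stack(obs)
    return obs.mean(axis=0), obs.std()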
def _set_env_vars(self, hps):
    # NOTE: the else branch below is unreachable; stats are loaded from disk.
    if True:
        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            None, hps['env'], nsteps=1, load=True)
    else:
        env = self.make_env(0, add_monitor=False, sleep_multiple=0)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        # self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(env, hps['env'])
        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            env, hps['env'], nsteps=100 * 100, force_reset=100)
        # self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(env, hps['env'], nsteps=10)
        env.close()
        del env
    self.envs = [
        functools.partial(self.make_env, i + 1)
        for i in range(self.envs_per_process)
    ]
def _set_env_vars(self):
    env = self.make_env(0, add_monitor=False)
    self.ob_space, self.ac_space = env.observation_space, env.action_space
    self.ob_mean, self.ob_std = random_agent_ob_mean_std(env, nsteps=10000)
    del env
    self.envs = [
        functools.partial(self.make_env, i)
        for i in range(self.envs_per_process)
    ]
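# The self.envs list built above holds zero-argument thunks rather than live
# environments; each worker realizes its env lazily by calling the partial.
# A minimal usage sketch, with make_env_stub as a hypothetical stand-in for
# self.make_env:
import functools

def make_env_stub(rank):
    # Construct and return the wrapped env for this rank here.
    return {"rank": rank}

env_fns = [functools.partial(make_env_stub, i) for i in range(4)]
envs = [fn() for fn in env_fns]  # each call builds one environment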
def _set_env_vars(self):
    import numpy as np
    env = self.make_env(0, add_monitor=False)
    self.ob_space, self.ac_space = env.observation_space, env.action_space
    self.env_ob_space = env.observation_space
    if self.depth_pred:
        # With depth prediction enabled, the policy's observation space is
        # overridden to 84x84 RGB, while env_ob_space keeps the env's full space.
        self.ob_space = gym.spaces.Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)
    self.ob_mean, self.ob_std = random_agent_ob_mean_std(env, depth_pred=self.hps['depth_pred'])
    del env
    self.envs = [functools.partial(self.make_env, i) for i in range(self.envs_per_process)]
def __init__(self, make_env, hps, num_timesteps, envs_per_process):
    self.make_env = make_env
    self.hps = hps
    self.envs_per_process = envs_per_process
    self.num_timesteps = num_timesteps
    self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
        None, hps['env'], nsteps=1, load=True)
    self.env = self.make_env(258)
    self.policy = CnnPolicy(scope='pol',
                            ob_space=self.ob_space,
                            ac_space=self.ac_space,
                            hidsize=512,
                            feat_dim=512,
                            ob_mean=self.ob_mean,
                            ob_std=self.ob_std,
                            layernormalize=False,
                            nl=tf.nn.leaky_relu)
    self.feature_extractor = {
        "none": FeatureExtractor,
        "idf": InverseDynamics,
        "vaesph": partial(VAE, spherical_obs=True),
        "vaenonsph": partial(VAE, spherical_obs=False),
        "pix2pix": JustPixels
    }[hps['feat_learning']]
    self.feature_extractor = self.feature_extractor(
        policy=self.policy,
        features_shared_with_policy=False,
        feat_dim=512,
        layernormalize=hps['layernorm'])
    self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
    self.dynamics = self.dynamics(
        auxiliary_task=self.feature_extractor,
        predict_from_pixels=hps['dyn_from_pixels'],
        feat_dim=512)
    self.agents = [
        # self.create_agent('presub095', hps),
        self.create_agent('presub089', hps),
        # self.create_agent('presub088', hps),
        # self.create_agent('presub087', hps),
        # self.create_agent('presub047', hps),
        # self.create_agent('presub018', hps),
        # self.create_agent('presub001', hps),
        # self.create_agent('presub002', hps),
        # self.create_agent('presub004', hps),
        # self.create_agent('presub005', hps),
        # self.create_agent('presub015', hps),
        # self.create_agent('presub016', hps),
        # self.create_agent('presub017', hps),
        # self.create_agent('presub019', hps),
        # self.create_agent('presub020', hps),
        # self.create_agent('presub021', hps),
    ]
def _set_env_vars(self):
    env = self.make_env(0, add_monitor=False)
    # Current code only runs with dm_suite, because of the added action-space
    # min and max used for clipping.
    assert self.hps['env_kind'] == "dm_suite"
    self.ac_space_min, self.ac_space_max = env.ac_space_min, env.ac_space_max
    self.ob_space, self.ac_space = env.observation_space, env.action_space
    self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
    del env
    self.envs = [
        functools.partial(self.make_env, i)
        for i in range(self.envs_per_process)
    ]
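# The dm_suite assertion above exists because sampled continuous actions must
# be clipped into the env's legal range before stepping. A minimal sketch of
# that clipping, assuming ac_space_min/ac_space_max are the array-like bounds
# stored above; clip_actions is an illustrative helper, not this codebase's:
import numpy as np

def clip_actions(actions, ac_space_min, ac_space_max):
    """Element-wise clip of sampled actions into the action bounds."""
    return np.clip(actions, ac_space_min, ac_space_max)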
def _set_env_vars(self):
    env = self.make_env(0, add_monitor=False)
    self.ob_space, self.ac_space = env.observation_space, env.action_space
    self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
    if self.hps["env_kind"] == "unity":
        env.close()
    # self.ob_mean, self.ob_std = 124.89177, 55.7459
    del env
    self.envs = [
        functools.partial(self.make_env, i)
        for i in range(self.envs_per_process)
    ]
def _set_env_vars(self):
    from time import sleep
    env = self.make_env(0, add_monitor=False)
    self.ob_space, self.ac_space = env.observation_space, env.action_space
    self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
    env.close()
    print("Waiting for 1 minute to make sure socket is closed on Linux")
    sleep(60)
    del env
    self.envs = [
        functools.partial(self.make_env, i)
        for i in range(self.envs_per_process)
    ]
def _set_env_vars(self): """ 该 env 仅是为了初始化 ob_space, ac_space, ob_mean, ob_std. 因此在算完之后 del 掉. 随后初始化 self.envs_per_process 个 env """ env = self.make_env(0, add_monitor=False) # ob_space.shape=(84, 84, 4) ac_space.shape=Discrete(4) self.ob_space, self.ac_space = env.observation_space, env.action_space # 随机智能体与环境交互, 计算观测的均值和标准差. ob_mean.shape=(84,84,4), 是0-255之间的数. ob_std是标量, breakout中为 1.8 self.ob_mean, self.ob_std = random_agent_ob_mean_std(env) del env self.envs = [ functools.partial(self.make_env, i) for i in range(self.envs_per_process) ]
def _set_env_vars(self): print("Making env") env = self.make_env(0, add_monitor=False) print("Done make env") self.ob_space, self.ac_space = env.observation_space, env.action_space print("Got obs and action space") self.ob_mean, self.ob_std = random_agent_ob_mean_std(env, 1000) print("Got observations") del env print("Deleted env, making partial funcs") self.envs = [ functools.partial(self.make_env, i) for i in range(self.envs_per_process) ] print("Envs:", self.envs)
    env = ProcessFrame84(env, crop=False)  # preprocess observations to 84x84
    env = FrameStack(env, 4)  # stack 4 consecutive frames as the input
    return env

make_env = partial(make_env_all_params, add_monitor=True)  # make env
env = make_env(0, add_monitor=False)
obs = env.reset()
print("obs and action space:", np.asarray(obs).shape, env.action_space.sample())
ob_space, ac_space = env.observation_space, env.action_space

# A random agent interacts with the env to compute the observations'
# mean and standard deviation.
from utils import random_agent_ob_mean_std
ob_mean, ob_std = random_agent_ob_mean_std(env)
print("obs mean:", ob_mean.shape, np.max(ob_mean), np.min(ob_mean))
print("obs std:", ob_std.shape, np.max(ob_std), np.min(ob_std))

# Initialize the environments.
envs = [partial(make_env, i) for i in range(5)]

# CNN policy
print("Init Policy.")
policy = CnnPolicy(scope='pol',
                   ob_space=ob_space,
                   ac_space=ac_space,
                   hidsize=512,
                   feat_dim=512,
                   ob_mean=ob_mean,
                   ob_std=ob_std,
                   layernormalize=False,
                   nl=tf.nn.leaky_relu)
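# A minimal sketch of the FrameStack idea used in the preprocessing above,
# assuming grayscale (84, 84, 1) frames concatenated along the channel axis;
# FrameStackSketch is illustrative, and the real wrapper in this codebase
# also handles Gym's reset/step plumbing:
from collections import deque
import numpy as np

class FrameStackSketch:
    def __init__(self, k=4):
        self.frames = deque(maxlen=k)
        self.k = k

    def reset(self, first_frame):
        # Repeat the first frame so the stack is full from the first step.
        for _ in range(self.k):
            self.frames.append(first_frame)
        return self._ob()

    def append(self, frame):
        self.frames.append(frame)
        return self._ob()

    def _ob(self):
        return np.concatenate(list(self.frames), axis=-1)  # (84, 84, k)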
def __init__(self, make_env, hps, num_timesteps, envs_per_process):
    self.make_env = make_env
    self.hps = hps
    self.envs_per_process = envs_per_process
    self.num_timesteps = num_timesteps
    # self._set_env_vars()
    self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
        None, hps['env'], nsteps=1, load=True)
    # env = self.make_env(256, add_monitor=False, sleep_multiple=1./32)
    # self.ob_space, self.ac_space = env.observation_space, env.action_space
    # env.close()
    # del env
    self.envs = [
        functools.partial(self.make_env, i + 256 + 1)
        for i in range(envs_per_process)
    ]
    self.policy = CnnPolicy(scope='pol',
                            ob_space=self.ob_space,
                            ac_space=self.ac_space,
                            hidsize=512,
                            feat_dim=512,
                            ob_mean=self.ob_mean,
                            ob_std=self.ob_std,
                            layernormalize=False,
                            nl=tf.nn.leaky_relu)
    self.feature_extractor = {
        "none": FeatureExtractor,
        "idf": InverseDynamics,
        "vaesph": partial(VAE, spherical_obs=True),
        "vaenonsph": partial(VAE, spherical_obs=False),
        "pix2pix": JustPixels
    }[hps['feat_learning']]
    self.feature_extractor = self.feature_extractor(
        policy=self.policy,
        features_shared_with_policy=False,
        feat_dim=512,
        layernormalize=hps['layernorm'])
    self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
    self.dynamics = self.dynamics(
        auxiliary_task=self.feature_extractor,
        predict_from_pixels=hps['dyn_from_pixels'],
        feat_dim=512)
    self.agent = PpoOptimizer(
        scope='ppo',
        ob_space=self.ob_space,
        ac_space=self.ac_space,
        stochpol=self.policy,
        use_news=hps['use_news'],
        gamma=hps['gamma'],
        lam=hps["lambda"],
        nepochs=hps['nepochs'],
        nminibatches=hps['nminibatches'],
        lr=hps['lr'],
        cliprange=0.1,
        nsteps_per_seg=hps['nsteps_per_seg'],
        nsegs_per_env=hps['nsegs_per_env'],
        ent_coef=hps['ent_coeff'],
        normrew=hps['norm_rew'],
        normadv=hps['norm_adv'],
        ext_coeff=hps['ext_coeff'],
        int_coeff=hps['int_coeff'],
        dynamics=self.dynamics,
        load=hps['load'],
        exp_name=hps['exp_name'],
    )
    self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
    self.agent.total_loss += self.agent.to_report['aux']
    self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
    self.agent.total_loss += self.agent.to_report['dyn_loss']
    self.agent.to_report['feat_var'] = tf.reduce_mean(
        tf.nn.moments(self.feature_extractor.features, [0, 1])[1])