class Tester(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        # Initializes ob_space, ac_space, ob_mean, ob_std and builds self.envs,
        # a list of environment constructors.
        self._set_env_vars()

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = FeatureExtractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        # Dynamics (environment model); the feature_extractor defined above is
        # passed in as the auxiliary task.
        self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor,
                                     reward_type=hps['reward_type'],
                                     sample_seeds=hps['sample_seeds'])

        self.agent = PpoOptimizer(scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_seg'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  dynamics=self.dynamics,
                                  nepochs_dvae=0)

        # Agent loss: actor, critic and entropy terms; add the auxiliary
        # feature-learning loss on top.
        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']

        # Dynamics loss: accumulate the losses of all dynamics models.
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']

        # Variance of the features the auxiliary task extracts from the
        # observations, shape=(512,); tf.reduce_mean below yields a scalar.
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        """
        This env is only used to initialize ob_space, ac_space, ob_mean and
        ob_std, so it is deleted once they are computed. Afterwards,
        self.envs_per_process environment constructors are created.
        """
        env = self.make_env(0)
        # ob_space.shape=(84, 84, 4), ac_space=Discrete(4)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        # A random agent interacts with the environment to compute the mean and
        # standard deviation of observations. ob_mean.shape=(84, 84, 4) with
        # values in [0, 255]; ob_std is a scalar (about 1.8 in Breakout).
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        if self.hps["env_kind"] == "unity":
            env.close()
        del env
        self.envs = [
            functools.partial(self.make_env, i)
            for i in range(self.envs_per_process)
        ]

    def play(self, tf_sess, args_tmp, saver, model_path):
        print("model_path: ", model_path)
        with tf_sess.as_default():
            print("Load weights..")
            saver.restore(tf_sess, model_path)
            print("Load done.")

            # Roll out a few evaluation episodes with the restored policy.
            env = self.make_env(0)
            max_reward = -10000.
            for i in range(5):
                obs = env.reset()
                rews, frames = [], []
                while True:
                    obs = np.expand_dims(np.squeeze(obs), axis=0)
                    assert obs.shape == (1, 84, 84, 4)
                    acs, vpreds, nlps = self.policy.get_ac_value_nlp(obs)
                    obs, rew, done, info = env.step(acs[0])
                    rews.append(rew)
                    obs = np.array(obs)
                    frames.append(env.render(mode='rgb_array'))
                    if done:
                        break
                # Keep the video of the best-scoring episode so far.
                if max_reward < np.sum(rews):
                    max_reward = np.sum(rews)
                    print("Max rewards:", max_reward)
                    save_np_as_mp4(
                        frames,
                        "/Users/bai/Desktop/video/" + args_tmp['env'] + '.mp4')
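# ----------------------------------------------------------------------------
# Hedged usage sketch (an assumption, not from the original source): how
# Tester might be driven for evaluation. `make_env_fn`, `hps`, `args_tmp` and
# `model_path` are hypothetical placeholders that the surrounding training
# script would supply; the TF1-style session/saver setup is assumed.
# ----------------------------------------------------------------------------
def run_test(make_env_fn, hps, args_tmp, model_path, num_timesteps=int(1e6)):
    # Building the Tester constructs the policy, feature extractor, dynamics
    # model and the PPO graph.
    tester = Tester(make_env=make_env_fn, hps=hps,
                    num_timesteps=num_timesteps, envs_per_process=1)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        # Restore the checkpoint and roll out 5 evaluation episodes, saving a
        # video of the best one.
        tester.play(sess, args_tmp, saver, model_path)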
class Scorer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        # self._set_env_vars()
        # Load precomputed observation statistics and spaces instead of
        # re-running a random agent.
        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            None, hps['env'], nsteps=1, load=True)
        # env = self.make_env(256, add_monitor=False, sleep_multiple=1./32)
        # self.ob_space, self.ac_space = env.observation_space, env.action_space
        # env.close()
        # del env
        self.envs = [
            functools.partial(self.make_env, i + 256 + 1)
            for i in range(envs_per_process)
        ]

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        # Select the auxiliary feature-learning task by name.
        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            load=hps['load'],
            exp_name=hps['exp_name'],
        )

        # Auxiliary feature-learning loss and dynamics loss are added to the
        # agent's total loss; feat_var reports the feature variance.
        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']

        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']

        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def score(self):
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        from time import sleep
        sleep(2)

        episode_reward = 0
        episode_rewards = []
        total_episodes = 0
        max_level = 0
        max_levels = []
        while True:
            # info = self.agent.step()
            # self.agent.rollout.collect_rollout()
            obs, prevrews, news, infos = self.agent.rollout.env_get(0)
            if prevrews is not None:
                episode_reward += prevrews
                if prevrews == 1:
                    max_level += 1
            if news:
                # Episode finished: record reward/level and print running stats.
                episode_rewards.append(episode_reward)
                ave_reward = sum(episode_rewards) / len(episode_rewards)
                total_episodes += 1
                max_levels.append(max_level)
                ave_level = sum(max_levels) / len(max_levels)
                ave_level = np.around(ave_level, 2)
                ave_reward = np.around(ave_reward, 2)
                print('ep:', total_episodes, 'level:', max_level,
                      'ave_level:', ave_level, 'episode_reward:',
                      episode_reward, 'ave_reward', ave_reward)
                episode_reward = 0
                max_level = 0
                if total_episodes >= 25:
                    break

            # acs, vpreds, nlps = self.agent.rollout.policy.get_ac_value_nlp(obs)
            # self.agent.rollout.env_step(0, acs)
            acs, vpreds, nlps = self.policy.get_ac_value_nlp(obs)
            self.agent.rollout.env_step(0, acs)
            self.agent.rollout.step_count += 1

        self.agent.stop_interaction()
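# ----------------------------------------------------------------------------
# Hedged usage sketch (an assumption, not from the original source): how
# Scorer might be invoked to evaluate a loaded agent over 25 episodes.
# `make_env_fn` and `hps` are hypothetical placeholders; `hps['load']` and
# `hps['exp_name']` are assumed to point PpoOptimizer at an existing
# checkpoint, and a default TF session is assumed to be needed before graph
# construction, mirroring the trainer's setup.
# ----------------------------------------------------------------------------
def run_score(make_env_fn, hps, num_timesteps=int(1e6)):
    with tf.Session().as_default():
        scorer = Scorer(make_env=make_env_fn, hps=hps,
                        num_timesteps=num_timesteps, envs_per_process=1)
        # score() steps a single environment until 25 episodes finish,
        # printing per-episode reward and level statistics along the way.
        scorer.score()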