Code example #1
import functools

import numpy as np
import tensorflow as tf

# Project-specific classes (CnnPolicy, FeatureExtractor, DvaeDynamics, PpoOptimizer,
# random_agent_ob_mean_std, save_np_as_mp4) are imported from the surrounding
# project's own modules, which are not shown here.


class Tester(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        # Initializes ob_space, ac_space, ob_mean and ob_std, and builds self.envs
        # (a list of environment factories).
        self._set_env_vars()

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = FeatureExtractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        # Instantiate the dynamics (environment) model; the feature_extractor
        # defined above is passed in as its auxiliary task.
        self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor,
                                     reward_type=hps['reward_type'],
                                     sample_seeds=hps['sample_seeds'])

        self.agent = PpoOptimizer(scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_seg'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  dynamics=self.dynamics,
                                  nepochs_dvae=0)

        # Agent loss: actor, critic and entropy terms; now add the auxiliary
        # feature-learning loss on top.
        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']

        # Dynamics loss: add the aggregated dynamics-model loss to the total loss.
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']

        # Variance of the features produced by the auxiliary task, shape (512,);
        # tf.reduce_mean below reduces it to a scalar.
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        """
            该 env 仅是为了初始化 ob_space, ac_space, ob_mean, ob_std. 因此在算完之后 del 掉.
            随后初始化 self.envs_per_process 个 env
        """
        env = self.make_env(0)
        # ob_space.shape = (84, 84, 4); ac_space = Discrete(4)
        self.ob_space, self.ac_space = env.observation_space, env.action_space

        # A random agent interacts with the environment to estimate the observation
        # mean and std. ob_mean has shape (84, 84, 4) with values in [0, 255];
        # ob_std is a scalar (about 1.8 on Breakout).
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        if self.hps["env_kind"] == "unity":
            env.close()
        del env
        self.envs = [
            functools.partial(self.make_env, i)
            for i in range(self.envs_per_process)
        ]

    def play(self, tf_sess, args_tmp, saver, model_path):
        print("model_path: ", model_path)

        with tf_sess.as_default():
            print("Load wights..")
            saver.restore(tf_sess, model_path)
        print("Load done.")

        # Roll out 5 evaluation episodes; save a video whenever an episode sets a
        # new best return.
        env = self.make_env(0)
        max_reward = -10000.
        for i in range(5):
            obs = env.reset()
            rews, frames = [], []
            while True:
                obs = np.expand_dims(np.squeeze(obs), axis=0)
                assert obs.shape == (1, 84, 84, 4)
                acs, vpreds, nlps = self.policy.get_ac_value_nlp(obs)
                obs, rew, done, info = env.step(acs[0])
                rews.append(rew)
                obs = np.array(obs)
                frames.append(env.render(mode='rgb_array'))
                if done:
                    break
            if max_reward < np.sum(rews):
                max_reward = np.sum(rews)
                print("Max rewards:", max_reward)
                save_np_as_mp4(
                    frames,
                    "/Users/bai/Desktop/video/" + args_tmp['env'] + '.mp4')
Code example #2
import functools
from functools import partial

import numpy as np
import tensorflow as tf

# Project-specific classes (CnnPolicy, FeatureExtractor, InverseDynamics, VAE,
# JustPixels, Dynamics, UNet, PpoOptimizer, random_agent_ob_mean_std) are imported
# from the surrounding project's own modules, which are not shown here.


class Scorer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        # self._set_env_vars()

        # Presumably loads precomputed observation statistics and spaces
        # (load=True, env=None) instead of recomputing them from a live env.
        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            None, hps['env'], nsteps=1, load=True)
        # env = self.make_env(256, add_monitor=False, sleep_multiple=1./32)
        # self.ob_space, self.ac_space = env.observation_space, env.action_space
        # env.close()
        # del env

        self.envs = [
            functools.partial(self.make_env, i + 256 + 1)
            for i in range(envs_per_process)
        ]

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        # Select the auxiliary feature-learning method according to hps['feat_learning'].
        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        # Pixel-space UNet dynamics for pix2pix; feature-space Dynamics otherwise.
        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            load=hps['load'],
            exp_name=hps['exp_name'],
        )

        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def score(self):
        # Evaluate for 25 episodes on env 0, tracking per-episode reward and the
        # level reached (each reward of exactly 1 is counted as clearing a level).
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        from time import sleep
        sleep(2)
        episode_reward = 0
        episode_rewards = []
        total_episodes = 0
        max_level = 0
        max_levels = []
        while True:
            # info = self.agent.step()
            # self.agent.rollout.collect_rollout()
            obs, prevrews, news, infos = self.agent.rollout.env_get(0)
            if prevrews is not None:
                episode_reward += prevrews
                if prevrews == 1:
                    max_level += 1
                if news:
                    episode_rewards.append(episode_reward)
                    ave_reward = sum(episode_rewards) / len(episode_rewards)
                    total_episodes += 1
                    max_levels.append(max_level)
                    ave_level = sum(max_levels) / len(max_levels)
                    ave_level = np.around(ave_level, 2)
                    ave_reward = np.around(ave_reward, 2)
                    print('ep:', total_episodes, 'level:', max_level,
                          'ave_level:', ave_level, 'episode_reward:',
                          episode_reward, 'ave_reward', ave_reward)
                    episode_reward = 0
                    max_level = 0
                    if total_episodes >= 25:
                        break
            # acs, vpreds, nlps = self.agent.rollout.policy.get_ac_value_nlp(obs)
            # self.agent.rollout.env_step(0, acs)
            acs, vpreds, nlps = self.policy.get_ac_value_nlp(obs)
            self.agent.rollout.env_step(0, acs)
            self.agent.rollout.step_count += 1

        self.agent.stop_interaction()
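
Both examples rely on the project's random_agent_ob_mean_std helper, whose implementation is not shown here. The rough sketch below is a hypothetical, simplified version that only illustrates what the comments in _set_env_vars describe: a random agent collects observations, from which a per-pixel mean and a scalar std are computed. It ignores the load=True / env=None variant used by Scorer.

import numpy as np

def random_agent_ob_mean_std_sketch(env, nsteps=10000):
    """Step the env with random actions and return the per-pixel observation mean
    (e.g. shape (84, 84, 4) with values in [0, 255]) and a scalar std."""
    obs = [np.asarray(env.reset())]
    for _ in range(nsteps):
        ob, _, done, _ = env.step(env.action_space.sample())
        obs.append(np.asarray(ob))
        if done:
            obs.append(np.asarray(env.reset()))
    obs = np.stack(obs)
    ob_mean = obs.mean(axis=0).astype(np.float32)        # per-pixel mean
    ob_std = obs.std(axis=0).mean().astype(np.float32)   # scalar: average per-pixel std
    return ob_mean, ob_std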