Example #1
def _set_env_vars(self):
     env = self.make_env(0)
     self.ob_space, self.ac_space = env.observation_space, env.action_space
     self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
     del env
     self.envs = [
         partial(self.make_env, i) for i in range(self.envs_per_process)
     ]
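Every example in this listing relies on random_agent_ob_mean_std to estimate observation statistics for input normalization. The repository's helper is not shown here; the following is a minimal sketch of what such a function typically does, assuming a Gym-style env and the (env, nsteps) call sites above, with return shapes matching the comments in Example #9 (per-pixel mean, scalar std). It is an illustration, not the codebase's implementation.

import numpy as np

def random_agent_ob_mean_std(env, nsteps=10000):
    # Roll out a uniformly random policy, collect observations, and
    # return their element-wise mean and overall standard deviation.
    ob = np.asarray(env.reset())
    obs = [ob]
    for _ in range(nsteps):
        ob, _, done, _ = env.step(env.action_space.sample())
        if done:
            ob = env.reset()
        obs.append(np.asarray(ob))
    obs = np.stack(obs)
    return obs.mean(axis=0), obs.std()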
Example #2
 def _set_env_vars(self, hps):
     load_precomputed = True
     if load_precomputed:
         # Load precomputed observation statistics and spaces from disk.
         self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
             None, hps['env'], nsteps=1, load=True)
     else:
         # Compute the statistics from scratch with a random agent instead.
         env = self.make_env(0, add_monitor=False, sleep_multiple=0)
         self.ob_space, self.ac_space = env.observation_space, env.action_space
         self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
             env, hps['env'], nsteps=100 * 100, force_reset=100)
         env.close()
         del env
     self.envs = [
         functools.partial(self.make_env, i + 1)
         for i in range(self.envs_per_process)
     ]
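Example #2's load=True path implies the statistics were computed once and cached on disk. The caching scheme itself is not shown anywhere in this listing; below is a minimal sketch of one, assuming NumPy .npz files and a hypothetical per-environment file layout.

import os
import numpy as np

def load_or_compute_ob_stats(env_name, compute_fn, cache_dir="ob_stats"):
    # Hypothetical cache helper: reuse saved observation statistics when
    # present, otherwise compute them once and persist the result.
    path = os.path.join(cache_dir, env_name + ".npz")
    if os.path.exists(path):
        data = np.load(path)
        return data["mean"], data["std"]
    mean, std = compute_fn()
    os.makedirs(cache_dir, exist_ok=True)
    np.savez(path, mean=mean, std=std)
    return mean, std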
Example #3
 def _set_env_vars(self):
     env = self.make_env(0, add_monitor=False)
     self.ob_space, self.ac_space = env.observation_space, env.action_space
     self.ob_mean, self.ob_std = random_agent_ob_mean_std(env, nsteps=10000)
     del env
     self.envs = [
         functools.partial(self.make_env, i)
         for i in range(self.envs_per_process)
     ]
Example #4
 def _set_env_vars(self):
     # assumes module-level `import gym` and `import numpy as np`
     env = self.make_env(0, add_monitor=False)
     self.ob_space, self.ac_space = env.observation_space, env.action_space
     self.env_ob_space = env.observation_space
     if self.depth_pred:
         # With depth prediction enabled, the policy sees 84x84 RGB frames
         # regardless of what the environment itself emits.
         self.ob_space = gym.spaces.Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)
     self.ob_mean, self.ob_std = random_agent_ob_mean_std(env, depth_pred=self.hps['depth_pred'])
     del env
     self.envs = [functools.partial(self.make_env, i) for i in range(self.envs_per_process)]
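For reference, the Box built above is a standard Gym observation space; a tiny standalone check, with the shape and dtype taken from Example #4:

import gym
import numpy as np

rgb_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 3), dtype=np.uint8)
assert rgb_space.contains(np.zeros((84, 84, 3), dtype=np.uint8))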
Example #5
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            None, hps['env'], nsteps=1, load=True)
        self.env = self.make_env(258)

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)
        self.agents = [
            self.create_agent('presub089', hps),
            # Additional checkpoints ('presub095', 'presub088', 'presub087',
            # 'presub047', 'presub018', 'presub001', 'presub002', ...) can be
            # re-enabled here to evaluate an ensemble of agents.
        ]
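The feature-extractor lookup in Example #5 is a plain dispatch dict: functools.partial pins constructor keywords such as spherical_obs up front, so every entry can later be called with the same shared arguments. A generic illustration of the pattern, using a stand-in class rather than the repository's:

from functools import partial

class DemoVAE:
    # Stand-in for the real VAE auxiliary task.
    def __init__(self, policy, feat_dim, spherical_obs=False):
        self.spherical_obs = spherical_obs

extractor_cls = {
    "vaesph": partial(DemoVAE, spherical_obs=True),
    "vaenonsph": partial(DemoVAE, spherical_obs=False),
}["vaesph"]
# The partial is called exactly like the bare class; only the pinned
# keyword differs between the two entries.
extractor = extractor_cls(policy=None, feat_dim=512)
assert extractor.spherical_obs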
Example #6
 def _set_env_vars(self):
     env = self.make_env(0, add_monitor=False)
     # Only dm_suite environments are supported here: the code relies on
     # the action-space min/max that they expose for clipping.
     assert self.hps['env_kind'] == "dm_suite"
     self.ac_space_min, self.ac_space_max = env.ac_space_min, env.ac_space_max
     self.ob_space, self.ac_space = env.observation_space, env.action_space
     self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
     del env
     self.envs = [
         functools.partial(self.make_env, i)
         for i in range(self.envs_per_process)
     ]
Example #7
 def _set_env_vars(self):
     env = self.make_env(0, add_monitor=False)
     self.ob_space, self.ac_space = env.observation_space, env.action_space
     self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
     if self.hps["env_kind"] == "unity":
         env.close()
         # self.ob_mean, self.ob_std = 124.89177, 55.7459
     del env
     self.envs = [
         functools.partial(self.make_env, i)
         for i in range(self.envs_per_process)
     ]
Example #8
    def _set_env_vars(self):
        from time import sleep

        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        env.close()
        print("Waiting for 1 minute to make sure socket is closed on Linux")
        sleep(60)
        del env
        self.envs = [
            functools.partial(self.make_env, i)
            for i in range(self.envs_per_process)
        ]
Example #9
 def _set_env_vars(self):
     """
     This env exists only to initialize ob_space, ac_space, ob_mean and
     ob_std, so it is deleted once those are computed. After that,
     self.envs_per_process envs are created.
     """
     env = self.make_env(0, add_monitor=False)
     # e.g. ob_space.shape == (84, 84, 4), ac_space == Discrete(4)
     self.ob_space, self.ac_space = env.observation_space, env.action_space
     # A random agent interacts with the environment to estimate observation
     # statistics: ob_mean has shape (84, 84, 4) with values in [0, 255],
     # while ob_std is a scalar (about 1.8 on Breakout).
     self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
     del env
     self.envs = [
         functools.partial(self.make_env, i)
         for i in range(self.envs_per_process)
     ]
Example #10
 def _set_env_vars(self):
     print("Making env")
     env = self.make_env(0, add_monitor=False)
     print("Done make env")
     self.ob_space, self.ac_space = env.observation_space, env.action_space
     print("Got obs and action space")
     self.ob_mean, self.ob_std = random_agent_ob_mean_std(env, 1000)
     print("Got observations")
     del env
     print("Deleted env, making partial funcs")
     self.envs = [
         functools.partial(self.make_env, i)
         for i in range(self.envs_per_process)
     ]
     print("Envs:", self.envs)
Example #11
        env = ProcessFrame84(env, crop=False)  # preprocess observations to 84x84
        env = FrameStack(env, 4)  # stack the last 4 frames as the policy input
        return env

    make_env = partial(make_env_all_params, add_monitor=True)
    # make env
    env = make_env(0, add_monitor=False)
    obs = env.reset()
    print("obs and action space:",
          np.asarray(obs).shape, env.action_space.sample())

    ob_space, ac_space = env.observation_space, env.action_space

    # A random agent interacts with the environment; the resulting
    # observations give the normalization mean and std.
    from utils import random_agent_ob_mean_std
    ob_mean, ob_std = random_agent_ob_mean_std(env)
    print("obs mean:", ob_mean.shape, np.max(ob_mean), np.min(ob_mean))
    print("obs std:", ob_std.shape, np.max(ob_std), np.min(ob_std))

    # initialize the environments (as zero-argument thunks)
    envs = [partial(make_env, i) for i in range(5)]

    # CNN policy
    print("Init Policy.")
    policy = CnnPolicy(scope='pol',
                       ob_space=ob_space,
                       ac_space=ac_space,
                       hidsize=512,
                       feat_dim=512,
                       ob_mean=ob_mean,
                       ob_std=ob_std,
                       layernormalize=False,
                       nl=tf.nn.leaky_relu)
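Note that envs above holds zero-argument callables rather than live environments; each worker can then construct its own copy lazily. A small usage sketch under that assumption (the CartPole env is only a placeholder):

import gym
from functools import partial

def make_env(rank):
    # Illustrative: each worker builds and seeds its own environment.
    env = gym.make("CartPole-v1")
    env.seed(rank)
    return env

env_fns = [partial(make_env, i) for i in range(5)]
env = env_fns[0]()  # constructed only at call time, e.g. inside a subprocess
env.close()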
Example #12
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        # self._set_env_vars()

        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            None, hps['env'], nsteps=1, load=True)
        # env = self.make_env(256, add_monitor=False, sleep_multiple=1./32)
        # self.ob_space, self.ac_space = env.observation_space, env.action_space
        # env.close()
        # del env

        self.envs = [
            functools.partial(self.make_env, i + 256 + 1)
            for i in range(envs_per_process)
        ]

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            load=hps['load'],
            exp_name=hps['exp_name'],
        )

        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
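The final lines of Example #12 fold the auxiliary and dynamics losses into PPO's total loss and log feat_var, the average per-feature variance of the learned features. A NumPy sketch of that diagnostic, assuming features shaped (nenvs, nsteps, feat_dim):

import numpy as np

feats = np.random.randn(8, 128, 512).astype(np.float32)  # (nenvs, nsteps, feat_dim)
# Per-feature variance over the batch and time axes (0, 1), averaged across
# the feature dimension: the same quantity as
# tf.reduce_mean(tf.nn.moments(features, [0, 1])[1]).
feat_var = np.var(feats, axis=(0, 1)).mean()
print(float(feat_var))  # close to 1.0 for standard-normal features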