Example #1
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()    # initialize ob_space, ac_space, ob_mean, ob_std, and self.envs with multiple environments

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = FeatureExtractor(policy=self.policy,
                                                  features_shared_with_policy=False,
                                                  feat_dim=512,
                                                  layernormalize=hps['layernorm'])

        # Initialize the dynamics (environment model) class; the feature_extractor defined above is passed in as an argument.
        self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor,
                                     reward_type=hps['reward_type'])

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            nepochs_dvae=0
        )

        # Agent loss: actor, critic, and entropy losses; now add the loss from feature learning.
        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']

        # Dynamics loss: sum up all of the dynamics losses.
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']

        # Variance of the features the auxiliary task extracts from states, shape=(512,); tf.reduce_mean below reduces it to a scalar.
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
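The comments in the example above describe how the auxiliary-task loss and the dynamics loss are added to the PPO total loss, and how 'feat_var' summarizes the spread of the extracted features. A minimal sketch of that 'feat_var' computation, assuming an illustrative (steps, envs, feat_dim) feature tensor (shapes are made up, not taken from the example):

import tensorflow as tf

# Hypothetical feature batch with shape (nsteps, nenvs, feat_dim).
features = tf.random.normal([128, 8, 512])
# tf.nn.moments over axes [0, 1] returns per-feature mean and variance, each of shape (512,).
mean, var = tf.nn.moments(features, axes=[0, 1])
# Averaging the per-feature variances gives the scalar that is reported as 'feat_var'.
feat_var = tf.reduce_mean(var)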
Example #2
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            None, hps['env'], nsteps=1, load=True)
        self.env = self.make_env(258)

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)
        self.agents = [
            # self.create_agent('presub095', hps),
            self.create_agent('presub089', hps),
            # self.create_agent('presub088', hps),
            # self.create_agent('presub087', hps),
            # self.create_agent('presub047', hps),
            # self.create_agent('presub018', hps),
            # self.create_agent('presub001', hps),
            # self.create_agent('presub002', hps),
            # self.create_agent('presub004', hps),
            # self.create_agent('presub005', hps),
            # self.create_agent('presub015', hps),
            # self.create_agent('presub016', hps),
            # self.create_agent('presub017', hps),
            # self.create_agent('presub019', hps),
            # self.create_agent('presub020', hps),
            # self.create_agent('presub021', hps),
        ]
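The feature extractor above is chosen by looking up hps['feat_learning'] in a dict whose values are classes or functools.partial objects that pre-bind class-specific options; the selected callable is then invoked with the shared keyword arguments. A small self-contained sketch of this pattern with illustrative stand-in classes (not the repo's real ones):

from functools import partial

class FeatureExtractor:
    def __init__(self, feat_dim):
        self.feat_dim = feat_dim

class VAE:
    def __init__(self, feat_dim, spherical_obs):
        self.feat_dim = feat_dim
        self.spherical_obs = spherical_obs

# partial pre-binds VAE-specific options so every registry entry can later be
# called with the same keyword arguments.
registry = {
    "none": FeatureExtractor,
    "vaesph": partial(VAE, spherical_obs=True),
    "vaenonsph": partial(VAE, spherical_obs=False),
}
extractor = registry["vaesph"](feat_dim=512)  # same as VAE(feat_dim=512, spherical_obs=True)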
Example #3
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope='pol',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            hidsize=512,
            feat_dim=512,
            ob_mean=self.ob_mean,
            ob_std=self.ob_std,
            layernormalize=False,
            nl=tf.nn.leaky_relu)

        self.feature_extractor = {"none": FeatureExtractor,
                                  "idf": InverseDynamics,
                                  "vaesph": partial(VAE, spherical_obs=True),
                                  "vaenonsph": partial(VAE, spherical_obs=False),
                                  "pix2pix": JustPixels}[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(policy=self.policy,
                                                        features_shared_with_policy=False,
                                                        feat_dim=512,
                                                        layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                      predict_from_pixels=hps['dyn_from_pixels'],
                                      feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics
        )

        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
Example #4
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=torch.nn.LeakyReLU)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            # if we use VAE, 'features_shared_with_policy' should be set to False,
            # because the shape of output_features of VAE.get_features is feat_dims * 2, including means and stds,
            # but the shape of out_features of policy.get_features is feat_dims,
            # only means is used as features exposed to dynamics
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)

        self.agent = PpoOptimizer(scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_seg'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  dynamics=self.dynamics)
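The comment in the example above explains that a VAE-style get_features returns feat_dim * 2 values (means and stds concatenated), while the policy's features have size feat_dim, and that only the means are exposed to the dynamics model. A hedged illustration of that split with made-up tensor names, not the repo's actual API:

import tensorflow as tf

feat_dim = 512
# Hypothetical batch of VAE outputs: means and stds concatenated along the last axis.
vae_features = tf.random.normal([16, feat_dim * 2])
means, stds = tf.split(vae_features, 2, axis=-1)  # each has shape (16, feat_dim)
dynamics_input = means                            # only the means would feed the dynamics model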
Example #5
    def __init__(self, make_env, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(scope='cnn_pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)
        self.feature_extractor = InverseDynamics(policy=self.policy,
                                                 feat_dim=512,
                                                 layernormalize=0)
        self.dynamics = Dynamics(auxiliary_task=self.feature_extractor,
                                 mode=MODE,
                                 feat_dim=512)
        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            policy=self.policy,
            use_news=0,
            gamma=.99,
            lam=.98,  #TODO Change this for potentially vastly different results
            nepochs=3,
            nminibatches=16,
            lr=1e-4,
            cliprange=.1,  #TODO Change this as well
            nsteps_per_seg=256,
            nsegs_per_env=1,
            ent_coeff=.001,
            normrew=1,
            normadv=1,
            ext_coeff=0.,
            int_coeff=1.,
            dynamics=self.dynamics)
        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
Example #6
    # A random agent interacts with the environment to compute the mean and std of the observations.
    from utils import random_agent_ob_mean_std
    ob_mean, ob_std = random_agent_ob_mean_std(env)
    print("obs mean:", ob_mean.shape, np.max(ob_mean), np.min(ob_mean))
    print("obs std:", ob_std.shape, np.max(ob_std), np.min(ob_std))

    # Initialize the environments
    envs = [partial(make_env, i) for i in range(5)]

    # CNN policy
    print("Init Policy.")
    policy = CnnPolicy(scope='pol',
                       ob_space=ob_space,
                       ac_space=ac_space,
                       hidsize=512,
                       feat_dim=512,
                       ob_mean=ob_mean,
                       ob_std=ob_std,
                       layernormalize=False,
                       nl=tf.nn.leaky_relu)

    print("Init Feature Extractor.")
    feature_extractor = FeatureExtractor(policy=policy,
                                         features_shared_with_policy=False,
                                         feat_dim=512,
                                         layernormalize=False)

    # Agent loss: actor, critic, and entropy losses; now add the loss from feature learning.
    print(feature_extractor.loss.shape)
    # feature_extractor.features.shape=(None,None,512)
    mean_std = tf.nn.moments(feature_extractor.features, [0, 1])
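The snippet above calls utils.random_agent_ob_mean_std to obtain observation statistics, but the helper itself is not shown. A rough sketch of what such a function typically does, under classic gym assumptions (4-tuple env.step) and with a name of our own choosing:

import numpy as np

def random_agent_ob_mean_std_sketch(env, nsteps=10000):
    # Roll out a uniformly random policy and collect observations.
    ob = np.asarray(env.reset())
    obs = [ob]
    for _ in range(nsteps):
        ob, _, done, _ = env.step(env.action_space.sample())
        obs.append(np.asarray(ob))
        if done:
            obs.append(np.asarray(env.reset()))
    obs = np.stack(obs)
    # Per-pixel mean and a single scalar std, matching the shapes printed in the
    # snippet above (ob_mean is image-shaped, ob_std is a scalar).
    return obs.mean(axis=0), obs.std()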
Example #7
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(
            scope="pol",
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            hidsize=512,
            feat_dim=512,
            ob_mean=self.ob_mean,
            ob_std=self.ob_std,
            layernormalize=False,
            nl=tf.nn.leaky_relu,
        )

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels,
        }[hps["feat_learning"]]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps["layernorm"],
        )

        self.dynamics = Dynamics if hps["feat_learning"] != "pix2pix" else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps["dyn_from_pixels"],
            feat_dim=512,
            ama=hps["ama"],
            uncertainty_penalty=hps["uncertainty_penalty"],
            clip_ama=hps["clip_ama"],
            clip_val=hps["clip_val"],
            reward_scaling=hps["reward_scaling"],
            abs_ama=hps["abs_ama"])
        self.agent = PpoOptimizer(
            scope="ppo",
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps["use_news"],
            gamma=hps["gamma"],
            lam=hps["lambda"],
            nepochs=hps["nepochs"],
            nminibatches=hps["nminibatches"],
            lr=hps["lr"],
            cliprange=0.1,
            nsteps_per_seg=hps["nsteps_per_seg"],
            nsegs_per_env=hps["nsegs_per_env"],
            ent_coef=hps["ent_coeff"],
            normrew=hps["norm_rew"],
            normadv=hps["norm_adv"],
            ext_coeff=hps["ext_coeff"],
            int_coeff=hps["int_coeff"],
            dynamics=self.dynamics,
            args=hps,
        )

        self.agent.to_report["aux"] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report["aux"]
        self.agent.to_report["dyn_loss"] = tf.reduce_mean(
            self.dynamics.loss[0])
        self.agent.total_loss += self.agent.to_report["dyn_loss"]
        self.agent.to_report["feat_var"] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
Example #8
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=hps['feat_dim'],
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=hps['feat_dim'],
            layernormalize=hps['layernorm'])

        self.intrinsic_model = IntrinsicModel if hps[
            'feat_learning'] != 'pix2pix' else UNet
        self.intrinsic_model = self.intrinsic_model(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feature_space=hps['feature_space'],
            nsteps_per_seg=hps['nsteps_per_seg'],
            feat_dim=hps['feat_dim'],
            naudio_samples=hps['naudio_samples'],
            train_discriminator=hps['train_discriminator'],
            discriminator_weighted=hps['discriminator_weighted'],
            noise_multiplier=hps['noise_multiplier'],
            concat=hps['concat'],
            log_dir=logger.get_dir(),
            make_video=hps['checkpoint_path'] != '')

        self.agent = PpoOptimizer(scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_seg'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  feature_space=hps['feature_space'],
                                  intrinsic_model=self.intrinsic_model,
                                  log_dir=logger.get_dir(),
                                  checkpoint_path=hps['checkpoint_path'])

        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        if hps['feature_space'] == 'joint':
            self.agent.to_report['dyn_visual_loss'] = tf.reduce_mean(
                self.intrinsic_model.visual_loss)
            self.agent.to_report['dyn_audio_loss'] = tf.reduce_mean(
                self.intrinsic_model.audio_loss)
            self.agent.to_report['discrim_train_loss'] = tf.reduce_mean(
                self.intrinsic_model.discrim_train_loss)
            self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean(
                self.intrinsic_model.loss)
        elif hps['train_discriminator']:
            self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean(
                self.intrinsic_model.discrim_train_loss)
        else:
            self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean(
                self.intrinsic_model.loss)
        self.agent.total_loss += self.agent.to_report['intrinsic_model_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
Example #9
def main(args):
    # mpi communicator.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # seed.
    workerseed = args.seed + 10000 * comm.Get_rank() if args.seed is not None else None
    if workerseed is not None:
        tc.manual_seed(workerseed % 2 ** 32)
        np.random.seed(workerseed % 2 ** 32)
        random.seed(workerseed % 2 ** 32)

    # logger.
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    # env.
    env = make_atari(args.env_name)
    env.seed(workerseed)
    env = Monitor(env, logger.get_dir() and
              os.path.join(logger.get_dir(), str(rank)))
    print(f"frame_stacking: {args.frame_stacking}")
    env = wrap_deepmind(env, frame_stack=args.frame_stacking,
                        clip_rewards=(args.mode == 'train'),
                        episode_life=(args.mode == 'train'))  # See Mnih et al., 2015 -> Methods -> Training Details.
    env.seed(workerseed)

    # agent.
    agent = CnnPolicy(
        img_channels=env.observation_space.shape[-1],
        num_actions=env.action_space.n,
        kind=args.model_type)

    # optimizer and scheduler.
    max_grad_steps = args.optim_epochs * args.env_steps // (comm.Get_size() * args.optim_batchsize)

    optimizer = tc.optim.Adam(agent.parameters(), lr=args.optim_stepsize, eps=1e-5)
    scheduler = tc.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer, max_lr=args.optim_stepsize, total_steps=max_grad_steps,
        pct_start=0.0, anneal_strategy='linear', cycle_momentum=False,
        div_factor=1.0)

    # checkpoint.
    if rank == 0:
        try:
            state_dict = tc.load(os.path.join(args.checkpoint_dir, args.model_name, 'model.pth'))
            agent.load_state_dict(state_dict)
            print(f"Continuing from checkpoint found at {os.path.join(args.checkpoint_dir, args.model_name, 'model.pth')}")
        except FileNotFoundError:
            print("Bad checkpoint or none on process 0. Continuing from scratch.")

    # sync.
    with tc.no_grad():
        for p in agent.parameters():
            p_data = p.data.numpy()
            comm.Bcast(p_data, root=0)
            p.data.copy_(tc.tensor(p_data).float())

    # operations.
    if args.mode == 'train':
        learn(env=env, agent=agent, optimizer=optimizer, scheduler=scheduler, comm=comm,
              timesteps_per_actorbatch=args.timesteps_per_actorbatch, max_timesteps=args.env_steps,
              optim_epochs=args.optim_epochs, optim_batchsize=args.optim_batchsize,
              gamma=args.gamma, lam=args.lam, clip_param=args.epsilon, entcoeff=args.ent_coef,
              checkpoint_dir=args.checkpoint_dir, model_name=args.model_name)
        env.close()

    elif args.mode == 'play':
        if comm.Get_rank() == 0:
            play(env=env, agent=agent, args=args)
            env.close()

    elif args.mode == 'movie':
        if comm.Get_rank() == 0:
            movie(env=env, agent=agent, args=args)
            env.close()

    else:
        raise NotImplementedError("Mode of operation not supported!")
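The seeding block near the top of main derives a distinct seed for each MPI worker from a single base seed. A minimal sketch of that scheme in isolation (torch seeding omitted); the helper name is ours, not part of the example:

import random
import numpy as np

def seed_worker(base_seed, rank):
    # Offset the base seed by a large multiple of the MPI rank so every process
    # gets a distinct but reproducible stream, then wrap into the 32-bit range
    # accepted by numpy's seeding.
    workerseed = (base_seed + 10000 * rank) % 2 ** 32
    np.random.seed(workerseed)
    random.seed(workerseed)
    return workerseed

seed_worker(0, rank=3)  # worker 3 would seed with 30000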
Example #10
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()  # initialize ob_space, ac_space, ob_mean, ob_std, and self.envs with multiple environments

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        # Before building the dynamics model, initialize the feature extractor (defined in auxiliary_task.py); "pix2pix" amounts to no feature extraction.
        self.feature_extractor = {
            "none": FeatureExtractor,  # 默认是none
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]  # select a feature extractor via the hps parameter
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        # Initialize the dynamics (environment model) class; the feature_extractor defined above is passed in as an argument.
        self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor,
                                     reward_type=hps['reward_type'],
                                     sample_seeds=hps['sample_seeds'])

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,  # the dynamics object
            nepochs_dvae=hps["nepochs_dvae"]  # number of extra epochs to train the dynamics model
        )

        # Agent loss: actor, critic, and entropy losses; now add the loss from feature learning.
        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']

        # Dynamics loss
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']

        # add bai. Record the DVAE loss separately, since the DVAE may need to be trained multiple times.
        self.agent.dynamics_loss = self.agent.to_report['dyn_loss']

        # Variance of the features the auxiliary task extracts from states, shape=(512,); tf.reduce_mean below reduces it to a scalar.
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
Example #11
    def __init__(self, make_env, hps, num_timesteps, envs_per_process, exp_name=None, env_name=None, policy=None, feat_ext=None, dyn=None, agent_num=None, restore_name=None):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.depth_pred = hps['depth_pred']
        self.aux_input = hps['aux_input']
        self.num_timesteps = num_timesteps
        self._set_env_vars()
        if exp_name:
            self.exp_name = exp_name
        else:
            self.exp_name = hps['exp_name']
        if env_name:
            self.env_name = env_name
        else:
            self.env_name = hps['env']
        if policy is None:
            if hps['lstm']:
                self.policy = LSTMPolicy(
                    scope='pol',
                    ob_space=self.ob_space,
                    ac_space=self.ac_space,
                    hidsize=512,
                    batchsize=hps['envs_per_process'],
                    feat_dim=512,
                    ob_mean=self.ob_mean,
                    ob_std=self.ob_std,
                    lstm1_size=hps['lstm1_size'],
                    lstm2_size=hps['lstm2_size'],
                    layernormalize=False,
                    nl=tf.nn.leaky_relu,
                    depth_pred=hps['depth_pred'],
                    aux_input=hps['aux_input'],
                )

            else:
                self.policy = CnnPolicy(
                    scope='pol',
                    ob_space=self.ob_space,
                    ac_space=self.ac_space,
                    hidsize=512,
                    feat_dim=512,
                    ob_mean=self.ob_mean,
                    ob_std=self.ob_std,
                    layernormalize=False,
                    nl=tf.nn.leaky_relu
                )
        else:
            self.policy = policy
            self.policy.restore()

        if feat_ext:
            self.feature_extractor = feat_ext
            self.feature_extractor.restore()
        else:

            self.feature_extractor = {"none": FeatureExtractor,
                                      "idf": InverseDynamics,
                                      "vaesph": partial(VAE, spherical_obs=True),
                                      "vaenonsph": partial(VAE, spherical_obs=False),
                                      "pix2pix": JustPixels}[hps['feat_learning']]

            self.feature_extractor = self.feature_extractor(policy=self.policy,
                                                            features_shared_with_policy=hps['feat_share'],
                                                            feat_dim=hps['dyn_feat_dim'],
                                                            layernormalize=hps['layernorm'])
        if dyn:
            self.dynamics = dyn
            self.dynamics.restore()
        else:

            self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
            self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                          predict_from_pixels=hps['dyn_from_pixels'],
                                          feat_dim=hps['dyn_feat_dim'])
        self.agent = PpoOptimizer(
            hps=hps,
            scope='ppo',
            ob_space=self.ob_space,
            env_ob_space=self.env_ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            exp_name=self.exp_name,
            env_name=self.env_name,
            video_log_freq=hps['video_log_freq'],
            model_save_freq=hps['model_save_freq'],
            use_apples=hps['use_apples'],
            agent_num=agent_num,
            restore_name=restore_name,
            multi_envs=hps['multi_train_envs'],
            lstm=hps['lstm'],
            lstm1_size=hps['lstm1_size'],
            lstm2_size=hps['lstm2_size'],
            depth_pred=hps['depth_pred'],
            aux_input=hps['aux_input'],
            beta_d=hps['beta'],
            early_stop=hps['early_stop'],
            optim=hps['optim'],
            decay=hps['decay'],
            grad_clip=hps['grad_clip'],
            log_grads=hps['log_grads'],
            logdir=hps['logdir']
        )
        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0,1])[1])
        if hps['curiosity']:
            #self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
            self.agent.total_loss += hps['aux_loss_coeff']*self.agent.to_report['aux']
            #self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
            self.agent.total_loss += hps['dyn_loss_coeff']*self.agent.to_report['dyn_loss']
Example #12
class Trainer(object):
    from baselines import logger
    def __init__(self, make_env, hps, num_timesteps, envs_per_process, exp_name=None, env_name=None, policy=None, feat_ext=None, dyn=None, agent_num=None, restore_name=None):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.depth_pred = hps['depth_pred']
        self.aux_input = hps['aux_input']
        self.num_timesteps = num_timesteps
        self._set_env_vars()
        if exp_name:
            self.exp_name = exp_name
        else:
            self.exp_name = hps['exp_name']
        if env_name:
            self.env_name = env_name
        else:
            self.env_name = hps['env']
        if policy is None:
            if hps['lstm']:
                self.policy = LSTMPolicy(
                    scope='pol',
                    ob_space=self.ob_space,
                    ac_space=self.ac_space,
                    hidsize=512,
                    batchsize=hps['envs_per_process'],
                    feat_dim=512,
                    ob_mean=self.ob_mean,
                    ob_std=self.ob_std,
                    lstm1_size=hps['lstm1_size'],
                    lstm2_size=hps['lstm2_size'],
                    layernormalize=False,
                    nl=tf.nn.leaky_relu,
                    depth_pred=hps['depth_pred'],
                    aux_input=hps['aux_input'],
                )

            else:
                self.policy = CnnPolicy(
                    scope='pol',
                    ob_space=self.ob_space,
                    ac_space=self.ac_space,
                    hidsize=512,
                    feat_dim=512,
                    ob_mean=self.ob_mean,
                    ob_std=self.ob_std,
                    layernormalize=False,
                    nl=tf.nn.leaky_relu
                )
        else:
            self.policy = policy
            self.policy.restore()

        if feat_ext:
            self.feature_extractor = feat_ext
            self.feature_extractor.restore()
        else:

            self.feature_extractor = {"none": FeatureExtractor,
                                      "idf": InverseDynamics,
                                      "vaesph": partial(VAE, spherical_obs=True),
                                      "vaenonsph": partial(VAE, spherical_obs=False),
                                      "pix2pix": JustPixels}[hps['feat_learning']]

            self.feature_extractor = self.feature_extractor(policy=self.policy,
                                                            features_shared_with_policy=hps['feat_share'],
                                                            feat_dim=hps['dyn_feat_dim'],
                                                            layernormalize=hps['layernorm'])
        if dyn:
            self.dynamics = dyn
            self.dynamics.restore()
        else:

            self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
            self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                          predict_from_pixels=hps['dyn_from_pixels'],
                                          feat_dim=hps['dyn_feat_dim'])
        self.agent = PpoOptimizer(
            hps=hps,
            scope='ppo',
            ob_space=self.ob_space,
            env_ob_space=self.env_ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            exp_name=self.exp_name,
            env_name=self.env_name,
            video_log_freq=hps['video_log_freq'],
            model_save_freq=hps['model_save_freq'],
            use_apples=hps['use_apples'],
            agent_num=agent_num,
            restore_name=restore_name,
            multi_envs=hps['multi_train_envs'],
            lstm=hps['lstm'],
            lstm1_size=hps['lstm1_size'],
            lstm2_size=hps['lstm2_size'],
            depth_pred=hps['depth_pred'],
            aux_input=hps['aux_input'],
            beta_d=hps['beta'],
            early_stop=hps['early_stop'],
            optim=hps['optim'],
            decay=hps['decay'],
            grad_clip=hps['grad_clip'],
            log_grads=hps['log_grads'],
            logdir=hps['logdir']
        )
        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0,1])[1])
        if hps['curiosity']:
            #self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
            self.agent.total_loss += hps['aux_loss_coeff']*self.agent.to_report['aux']
            #self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
            self.agent.total_loss += hps['dyn_loss_coeff']*self.agent.to_report['dyn_loss']
            #self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        import numpy as np
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.env_ob_space = env.observation_space
        if self.depth_pred:
            self.ob_space = gym.spaces.Box(0, 255, shape=(84,84,3), dtype=np.uint8)
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env, depth_pred=self.hps['depth_pred'])
        del env
        self.envs = [functools.partial(self.make_env, i) for i in range(self.envs_per_process)]

    def train(self, saver, sess, restore=False):
        from baselines import logger
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
        if restore:
            print("Restoring model for training")
            saver.restore(sess, "models/" + self.hps['restore_model'] + ".ckpt")
            print("Loaded model", self.hps['restore_model'])
        write_meta_graph = False
        while True:
            info = self.agent.step()
            if info['update']:
                if info['update']['recent_best_ext_ret'] is None:
                    info['update']['recent_best_ext_ret'] = 0
                wandb.log(info['update'])
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
        if self.hps['tune_env']:
            filename = "models/" + self.hps['restore_model'] + "_tune_on_" + self.hps['tune_env'] + "_final.ckpt"
        else:
            filename = "models/" + self.hps['exp_name'] + "_final.ckpt"
        saver.save(sess, filename, write_meta_graph=False)
        self.policy.save_model(self.hps['exp_name'], 'final')
        self.agent.stop_interaction()
Example #13
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            pred_discount=hps['pred_discount'],
            num_preds=hps['num_preds'],
            feat_dim=512)
        ''' Setting dynamics object in policy for feature extraction'''
        self.policy.set_dynamics(self.dynamics)
        self.dynamics.set_loss()

        self.agent = PpoOptimizer(scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_seg'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  dynamics=self.dynamics)

        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss1 +
                                                          self.dynamics.loss2)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=True)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [
            functools.partial(self.make_env, i)
            for i in range(self.envs_per_process)
        ]

    def train(self):
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        while True:
            info = self.agent.step()
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break

        self.agent.stop_interaction()
Example #14
class Tester(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()  # initialize ob_space, ac_space, ob_mean, ob_std, and self.envs with multiple environments

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = FeatureExtractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        # Initialize the dynamics (environment model) class; the feature_extractor defined above is passed in as an argument.
        self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor,
                                     reward_type=hps['reward_type'],
                                     sample_seeds=hps['sample_seeds'])

        self.agent = PpoOptimizer(scope='ppo',
                                  ob_space=self.ob_space,
                                  ac_space=self.ac_space,
                                  stochpol=self.policy,
                                  use_news=hps['use_news'],
                                  gamma=hps['gamma'],
                                  lam=hps["lambda"],
                                  nepochs=hps['nepochs'],
                                  nminibatches=hps['nminibatches'],
                                  lr=hps['lr'],
                                  cliprange=0.1,
                                  nsteps_per_seg=hps['nsteps_per_seg'],
                                  nsegs_per_env=hps['nsegs_per_env'],
                                  ent_coef=hps['ent_coeff'],
                                  normrew=hps['norm_rew'],
                                  normadv=hps['norm_adv'],
                                  ext_coeff=hps['ext_coeff'],
                                  int_coeff=hps['int_coeff'],
                                  dynamics=self.dynamics,
                                  nepochs_dvae=0)

        # Agent loss: actor, critic, and entropy losses; now add the loss from feature learning.
        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']

        # Dynamics loss: sum up all of the dynamics losses.
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']

        # Variance of the features the auxiliary task extracts from states, shape=(512,); tf.reduce_mean below reduces it to a scalar.
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        """
            This env is created only to initialize ob_space, ac_space, ob_mean, and ob_std, so it is deleted once they are computed.
            Afterwards, self.envs_per_process env constructors are created.
        """
        env = self.make_env(0)
        # ob_space.shape=(84, 84, 4), ac_space=Discrete(4)
        self.ob_space, self.ac_space = env.observation_space, env.action_space

        # A random agent interacts with the environment to compute the mean and std of the observations. ob_mean.shape=(84,84,4) with values in 0-255; ob_std is a scalar (about 1.8 in Breakout).
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        if self.hps["env_kind"] == "unity":
            env.close()
        del env
        self.envs = [
            functools.partial(self.make_env, i)
            for i in range(self.envs_per_process)
        ]

    def play(self, tf_sess, args_tmp, saver, model_path):
        print("model_path: ", model_path)

        with tf_sess.as_default():
            print("Load wights..")
            saver.restore(tf_sess, model_path)
        print("Load done.")

        # rollout
        env = self.make_env(0)
        max_reward = -10000.
        for i in range(5):
            obs = env.reset()
            rews, frames = [], []
            while True:
                obs = np.expand_dims(np.squeeze(obs), axis=0)
                assert obs.shape == (1, 84, 84, 4)
                acs, vpreds, nlps = self.policy.get_ac_value_nlp(obs)
                obs, rew, done, info = env.step(acs[0])
                rews.append(rew)
                obs = np.array(obs)
                frames.append(env.render(mode='rgb_array'))
                if done:
                    break
            if max_reward < np.sum(rews):
                max_reward = np.sum(rews)
                print("Max rewards:", max_reward)
                save_np_as_mp4(
                    frames,
                    "/Users/bai/Desktop/video/" + args_tmp['env'] + '.mp4')
Example #15
class Scorer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            None, hps['env'], nsteps=1, load=True)
        self.env = self.make_env(258)

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)
        self.agents = [
            # self.create_agent('presub095', hps),
            self.create_agent('presub089', hps),
            # self.create_agent('presub088', hps),
            # self.create_agent('presub087', hps),
            # self.create_agent('presub047', hps),
            # self.create_agent('presub018', hps),
            # self.create_agent('presub001', hps),
            # self.create_agent('presub002', hps),
            # self.create_agent('presub004', hps),
            # self.create_agent('presub005', hps),
            # self.create_agent('presub015', hps),
            # self.create_agent('presub016', hps),
            # self.create_agent('presub017', hps),
            # self.create_agent('presub019', hps),
            # self.create_agent('presub020', hps),
            # self.create_agent('presub021', hps),
        ]

    def create_agent(self, exp_name, hps):
        # graph = tf.Graph()
        # graph.as_default()
        agent = PpoOptimizer(
            scope=exp_name,
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            load=hps['load'],
            exp_name=exp_name,
        )

        agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        agent.total_loss += agent.to_report['aux']
        agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        agent.total_loss += agent.to_report['dyn_loss']
        agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

        # agent.graph = graph
        # tf.reset_default_graph()

        return agent

    def score(self):
        episode_reward = 0
        episode_rewards = []
        total_episodes = 0
        samples = 0
        obs = np.empty((len(self.agents) + samples, 1, *self.ob_space.shape),
                       np.float32)
        obs[0] = self.env.reset()
        max_level = 0
        max_levels = []
        for agent in self.agents:
            agent.no_mpi_start_interaction([self.env],
                                           nlump=self.hps['nlumps'],
                                           dynamics=self.dynamics)

        # if is_grading(self.env):
        #     while not done_grading(self.env):
        #         # run_episode(env)
        #         done = False
        #         episode_reward = 0.0

        #         while not done:
        #             action = env.action_space.sample()
        #             obs, reward, done, info = env.step(action)
        #             episode_reward += reward

        #         self.env.reset()
        #     return

        while True:
            # aa = obs.reshape([len(obs) * 1, 1, *self.ob_space.shape])

            for i in range(len(self.agents) - 1):
                obs[1 + i] = obs[0]
            for i in range(samples):
                mu, sigma = 0, 0.1
                noise = np.random.normal(mu, sigma, obs[0].shape)
                obs[len(self.agents) + i] = obs[0] + noise
            # obs[1] = np.copy(obs[0])
            # obs[1] = cv2.randn(obs[1],(128),(9))
            action_scores, acs, vpreds, nlps = self.policy.inference_get_ac_value_nlp(
                obs)
            max_actions = np.unravel_index(action_scores.argmax(),
                                           action_scores.shape)
            max_action = max_actions[1]
            max_v = vpreds.argmax()
            max_npl = nlps.argmax()
            min_npl = nlps.argmin()
            action = acs[0]  # default
            # action = int(max_action) # based on highest scoring action
            # action = int(acs[max_v]) # based on highest value
            # action = int(acs[min_npl]) # based on min npl
            # action = action_scores[min_npl].argmax()
            ob, reward, done, _ = self.env.step(action)
            obs[0] = ob
            episode_reward += reward
            if reward == 1:
                max_level += 1
            if done:
                episode_rewards.append(episode_reward)
                ave_reward = sum(episode_rewards) / len(episode_rewards)
                total_episodes += 1
                max_levels.append(max_level)
                ave_level = sum(max_levels) / len(max_levels)
                print('ep:', total_episodes, 'level:', max_level, 'ave_level:',
                      round(ave_level, 2), 'episode_reward:', episode_reward,
                      'ave_reward', round(ave_reward, 2))
                episode_reward = 0
                max_level = 0
                if is_grading(self.env):
                    if done_grading(self.env):
                        break
                elif total_episodes >= 25:
                    break
                obs[0] = self.env.reset()
        self.env.close()
Example #16
class Scorer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        # self._set_env_vars()

        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            None, hps['env'], nsteps=1, load=True)
        # env = self.make_env(256, add_monitor=False, sleep_multiple=1./32)
        # self.ob_space, self.ac_space = env.observation_space, env.action_space
        # env.close()
        # del env

        self.envs = [
            functools.partial(self.make_env, i + 256 + 1)
            for i in range(envs_per_process)
        ]

        self.policy = CnnPolicy(scope='pol',
                                ob_space=self.ob_space,
                                ac_space=self.ac_space,
                                hidsize=512,
                                feat_dim=512,
                                ob_mean=self.ob_mean,
                                ob_std=self.ob_std,
                                layernormalize=False,
                                nl=tf.nn.leaky_relu)

        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(
            policy=self.policy,
            features_shared_with_policy=False,
            feat_dim=512,
            layernormalize=hps['layernorm'])

        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(
            auxiliary_task=self.feature_extractor,
            predict_from_pixels=hps['dyn_from_pixels'],
            feat_dim=512)

        self.agent = PpoOptimizer(
            scope='ppo',
            ob_space=self.ob_space,
            ac_space=self.ac_space,
            stochpol=self.policy,
            use_news=hps['use_news'],
            gamma=hps['gamma'],
            lam=hps["lambda"],
            nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'],
            lr=hps['lr'],
            cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'],
            nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'],
            normrew=hps['norm_rew'],
            normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'],
            int_coeff=hps['int_coeff'],
            dynamics=self.dynamics,
            load=hps['load'],
            exp_name=hps['exp_name'],
        )

        self.agent.to_report['aux'] = tf.reduce_mean(
            self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(
            tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def score(self):
        self.agent.start_interaction(self.envs,
                                     nlump=self.hps['nlumps'],
                                     dynamics=self.dynamics)
        from time import sleep
        sleep(2)
        episode_reward = 0
        episode_rewards = []
        total_episodes = 0
        max_level = 0
        max_levels = []
        while True:
            # info = self.agent.step()
            # self.agent.rollout.collect_rollout()
            obs, prevrews, news, infos = self.agent.rollout.env_get(0)
            if prevrews is not None:
                episode_reward += prevrews
                if prevrews == 1:
                    max_level += 1
                if news:
                    episode_rewards.append(episode_reward)
                    ave_reward = sum(episode_rewards) / len(episode_rewards)
                    total_episodes += 1
                    max_levels.append(max_level)
                    ave_level = sum(max_levels) / len(max_levels)
                    ave_level = np.around(ave_level, 2)
                    ave_reward = np.around(ave_reward, 2)
                    print('ep:', total_episodes, 'level:', max_level,
                          'ave_level:', ave_level, 'episode_reward:',
                          episode_reward, 'ave_reward', ave_reward)
                    episode_reward = 0
                    max_level = 0
                    if total_episodes >= 25:
                        break
            # acs, vpreds, nlps = self.agent.rollout.policy.get_ac_value_nlp(obs)
            # self.agent.rollout.env_step(0, acs)
            acs, vpreds, nlps = self.policy.get_ac_value_nlp(obs)
            self.agent.rollout.env_step(0, acs)
            self.agent.rollout.step_count += 1

        self.agent.stop_interaction()