def __init__(self, make_env, hps, num_timesteps, envs_per_process):
    self.make_env = make_env
    self.hps = hps
    self.envs_per_process = envs_per_process
    self.num_timesteps = num_timesteps
    # Initialize ob_space, ac_space, ob_mean, ob_std and build self.envs (the per-process environments).
    self._set_env_vars()
    self.policy = CnnPolicy(scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
                            hidsize=512, feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std,
                            layernormalize=False, nl=tf.nn.leaky_relu)
    self.feature_extractor = FeatureExtractor(policy=self.policy, features_shared_with_policy=False,
                                              feat_dim=512, layernormalize=hps['layernorm'])
    # Instantiate the dynamics (environment) model; the feature extractor defined above is passed in as its auxiliary task.
    self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor, reward_type=hps['reward_type'])
    self.agent = PpoOptimizer(
        scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space, stochpol=self.policy,
        use_news=hps['use_news'], gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
        nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
        nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'],
        ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'],
        ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'], dynamics=self.dynamics,
        nepochs_dvae=0)
    # Agent loss: actor, critic and entropy terms; now add the loss from auxiliary feature learning.
    self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
    self.agent.total_loss += self.agent.to_report['aux']
    # Dynamics loss: accumulate all dynamics losses.
    self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
    self.agent.total_loss += self.agent.to_report['dyn_loss']
    # Variance of the features extracted by the auxiliary task, shape=(512,); tf.reduce_mean below reduces it to a scalar.
    self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
def __init__(self, make_env, hps, num_timesteps, envs_per_process):
    self.make_env = make_env
    self.hps = hps
    self.envs_per_process = envs_per_process
    self.num_timesteps = num_timesteps
    self._set_env_vars()
    self.policy = CnnPolicy(
        scope='pol', ob_space=self.ob_space, ac_space=self.ac_space, hidsize=512, feat_dim=512,
        ob_mean=self.ob_mean, ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu)
    self.feature_extractor = {"none": FeatureExtractor,
                              "idf": InverseDynamics,
                              "vaesph": partial(VAE, spherical_obs=True),
                              "vaenonsph": partial(VAE, spherical_obs=False),
                              "pix2pix": JustPixels}[hps['feat_learning']]
    self.feature_extractor = self.feature_extractor(policy=self.policy, features_shared_with_policy=False,
                                                    feat_dim=512, layernormalize=hps['layernorm'])
    self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
    self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                  predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512)
    self.agent = PpoOptimizer(
        scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space, stochpol=self.policy,
        use_news=hps['use_news'], gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
        nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
        nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'],
        ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'],
        ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'], dynamics=self.dynamics)
    self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
    self.agent.total_loss += self.agent.to_report['aux']
    self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
    self.agent.total_loss += self.agent.to_report['dyn_loss']
    self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
def __init__(self, make_env, hps, num_timesteps, envs_per_process):
    self.make_env = make_env
    self.hps = hps
    self.envs_per_process = envs_per_process
    self.num_timesteps = num_timesteps
    self._set_env_vars()
    self.policy = CnnPolicy(scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
                            hidsize=512, feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std,
                            layernormalize=False, nl=torch.nn.LeakyReLU)
    self.feature_extractor = {
        "none": FeatureExtractor,
        "idf": InverseDynamics,
        "vaesph": partial(VAE, spherical_obs=True),
        "vaenonsph": partial(VAE, spherical_obs=False),
        "pix2pix": JustPixels
    }[hps['feat_learning']]
    self.feature_extractor = self.feature_extractor(
        policy=self.policy,
        # If we use the VAE, 'features_shared_with_policy' must be set to False: the output of
        # VAE.get_features has shape feat_dim * 2 (means and stds), whereas policy.get_features
        # outputs feat_dim values; only the means are exposed to the dynamics model as features.
        features_shared_with_policy=False,
        feat_dim=512,
        layernormalize=hps['layernorm'])
    self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
    self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                  predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512)
    self.agent = PpoOptimizer(
        scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space, stochpol=self.policy,
        use_news=hps['use_news'], gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
        nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
        nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'],
        ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'],
        ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'], dynamics=self.dynamics)
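# A minimal sketch (not from the repo) illustrating the comment above: a VAE-style get_features
# returns feat_dim * 2 values per observation (means concatenated with stds), while the dynamics
# model should only consume the means. The shapes and the helper name `split_vae_features` are
# assumptions for illustration.
import numpy as np

def split_vae_features(vae_out, feat_dim=512):
    # vae_out: array of shape (..., feat_dim * 2) laid out as [means | stds]
    assert vae_out.shape[-1] == feat_dim * 2
    means, stds = vae_out[..., :feat_dim], vae_out[..., feat_dim:]
    return means, stds

vae_out = np.random.randn(8, 128, 1024).astype(np.float32)  # (nenvs, nsteps, 2 * feat_dim)
dyn_feats, _ = split_vae_features(vae_out)                  # only the means feed the dynamics model
print(dyn_feats.shape)                                      # (8, 128, 512)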
def __init__(self, make_env, num_timesteps, envs_per_process):
    self.make_env = make_env
    self.envs_per_process = envs_per_process
    self.num_timesteps = num_timesteps
    self._set_env_vars()
    self.policy = CnnPolicy(scope='cnn_pol', ob_space=self.ob_space, ac_space=self.ac_space,
                            hidsize=512, feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std,
                            layernormalize=False, nl=tf.nn.leaky_relu)
    self.feature_extractor = InverseDynamics(policy=self.policy, feat_dim=512, layernormalize=0)
    self.dynamics = Dynamics(auxiliary_task=self.feature_extractor, mode=MODE, feat_dim=512)
    self.agent = PpoOptimizer(
        scope='ppo',
        ob_space=self.ob_space,
        ac_space=self.ac_space,
        policy=self.policy,
        use_news=0,
        gamma=.99,
        lam=.98,  # TODO: Change this for potentially vastly different results
        nepochs=3,
        nminibatches=16,
        lr=1e-4,
        cliprange=.1,  # TODO: Change this as well
        nsteps_per_seg=256,
        nsegs_per_env=1,
        ent_coeff=.001,
        normrew=1,
        normadv=1,
        ext_coeff=0.,
        int_coeff=1.,
        dynamics=self.dynamics)
    self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
    self.agent.total_loss += self.agent.to_report['aux']
    self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
    self.agent.total_loss += self.agent.to_report['dyn_loss']
    self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
# A random agent interacts with the environment to compute the mean and standard deviation of the observations.
from utils import random_agent_ob_mean_std
ob_mean, ob_std = random_agent_ob_mean_std(env)
print("obs mean:", ob_mean.shape, np.max(ob_mean), np.min(ob_mean))
print("obs std:", ob_std.shape, np.max(ob_std), np.min(ob_std))

# Initialize the environments.
envs = [partial(make_env, i) for i in range(5)]

# CNN policy
print("Init Policy.")
policy = CnnPolicy(scope='pol', ob_space=ob_space, ac_space=ac_space, hidsize=512, feat_dim=512,
                   ob_mean=ob_mean, ob_std=ob_std, layernormalize=False, nl=tf.nn.leaky_relu)

print("Init Feature Extractor.")
feature_extractor = FeatureExtractor(policy=policy, features_shared_with_policy=False,
                                     feat_dim=512, layernormalize=False)

# Agent loss: actor, critic and entropy terms, plus the loss from auxiliary feature learning.
print(feature_extractor.loss.shape)

# feature_extractor.features.shape = (None, None, 512)
mean_std = tf.nn.moments(feature_extractor.features, [0, 1])
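# A small numpy check (not from the repo) of what the feature-variance diagnostic computes:
# tf.nn.moments(features, [0, 1]) reduces over the (env, timestep) axes and returns per-feature
# mean and variance of shape (512,); averaging the variance gives the scalar reported as 'feat_var'.
# The batch shape below is an assumption for illustration.
import numpy as np

features = np.random.randn(8, 128, 512).astype(np.float32)  # (nenvs, nsteps, feat_dim)
per_feature_var = features.var(axis=(0, 1))                 # shape (512,), like tf.nn.moments(..., [0, 1])[1]
feat_var = per_feature_var.mean()                           # scalar logged as 'feat_var'
print(per_feature_var.shape, feat_var)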
def __init__(self, make_env, hps, num_timesteps, envs_per_process):
    self.make_env = make_env
    self.hps = hps
    self.envs_per_process = envs_per_process
    self.num_timesteps = num_timesteps
    self._set_env_vars()
    self.policy = CnnPolicy(scope="pol", ob_space=self.ob_space, ac_space=self.ac_space,
                            hidsize=512, feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std,
                            layernormalize=False, nl=tf.nn.leaky_relu)
    self.feature_extractor = {
        "none": FeatureExtractor,
        "idf": InverseDynamics,
        "vaesph": partial(VAE, spherical_obs=True),
        "vaenonsph": partial(VAE, spherical_obs=False),
        "pix2pix": JustPixels,
    }[hps["feat_learning"]]
    self.feature_extractor = self.feature_extractor(policy=self.policy, features_shared_with_policy=False,
                                                    feat_dim=512, layernormalize=hps["layernorm"])
    self.dynamics = Dynamics if hps["feat_learning"] != "pix2pix" else UNet
    self.dynamics = self.dynamics(
        auxiliary_task=self.feature_extractor, predict_from_pixels=hps["dyn_from_pixels"],
        feat_dim=512, ama=hps["ama"], uncertainty_penalty=hps["uncertainty_penalty"],
        clip_ama=hps["clip_ama"], clip_val=hps["clip_val"],
        reward_scaling=hps["reward_scaling"], abs_ama=hps["abs_ama"])
    self.agent = PpoOptimizer(
        scope="ppo", ob_space=self.ob_space, ac_space=self.ac_space, stochpol=self.policy,
        use_news=hps["use_news"], gamma=hps["gamma"], lam=hps["lambda"], nepochs=hps["nepochs"],
        nminibatches=hps["nminibatches"], lr=hps["lr"], cliprange=0.1,
        nsteps_per_seg=hps["nsteps_per_seg"], nsegs_per_env=hps["nsegs_per_env"],
        ent_coef=hps["ent_coeff"], normrew=hps["norm_rew"], normadv=hps["norm_adv"],
        ext_coeff=hps["ext_coeff"], int_coeff=hps["int_coeff"], dynamics=self.dynamics,
        args=hps,
    )
    self.agent.to_report["aux"] = tf.reduce_mean(self.feature_extractor.loss)
    self.agent.total_loss += self.agent.to_report["aux"]
    self.agent.to_report["dyn_loss"] = tf.reduce_mean(self.dynamics.loss[0])
    self.agent.total_loss += self.agent.to_report["dyn_loss"]
    self.agent.to_report["feat_var"] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
def __init__(self, make_env, hps, num_timesteps, envs_per_process):
    self.make_env = make_env
    self.hps = hps
    self.envs_per_process = envs_per_process
    self.num_timesteps = num_timesteps
    self._set_env_vars()
    self.policy = CnnPolicy(scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
                            hidsize=512, feat_dim=hps['feat_dim'], ob_mean=self.ob_mean,
                            ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu)
    self.feature_extractor = {
        "none": FeatureExtractor,
        "idf": InverseDynamics,
        "vaesph": partial(VAE, spherical_obs=True),
        "vaenonsph": partial(VAE, spherical_obs=False),
        "pix2pix": JustPixels
    }[hps['feat_learning']]
    self.feature_extractor = self.feature_extractor(policy=self.policy, features_shared_with_policy=False,
                                                    feat_dim=hps['feat_dim'], layernormalize=hps['layernorm'])
    self.intrinsic_model = IntrinsicModel if hps['feat_learning'] != 'pix2pix' else UNet
    self.intrinsic_model = self.intrinsic_model(
        auxiliary_task=self.feature_extractor, predict_from_pixels=hps['dyn_from_pixels'],
        feature_space=hps['feature_space'], nsteps_per_seg=hps['nsteps_per_seg'],
        feat_dim=hps['feat_dim'], naudio_samples=hps['naudio_samples'],
        train_discriminator=hps['train_discriminator'],
        discriminator_weighted=hps['discriminator_weighted'],
        noise_multiplier=hps['noise_multiplier'], concat=hps['concat'],
        log_dir=logger.get_dir(), make_video=hps['checkpoint_path'] != '')
    self.agent = PpoOptimizer(
        scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space, stochpol=self.policy,
        use_news=hps['use_news'], gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
        nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
        nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'],
        ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'],
        ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'], feature_space=hps['feature_space'],
        intrinsic_model=self.intrinsic_model, log_dir=logger.get_dir(),
        checkpoint_path=hps['checkpoint_path'])
    self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
    self.agent.total_loss += self.agent.to_report['aux']
    if hps['feature_space'] == 'joint':
        self.agent.to_report['dyn_visual_loss'] = tf.reduce_mean(self.intrinsic_model.visual_loss)
        self.agent.to_report['dyn_audio_loss'] = tf.reduce_mean(self.intrinsic_model.audio_loss)
        self.agent.to_report['discrim_train_loss'] = tf.reduce_mean(self.intrinsic_model.discrim_train_loss)
        self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean(self.intrinsic_model.loss)
    elif hps['train_discriminator']:
        self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean(self.intrinsic_model.discrim_train_loss)
    else:
        self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean(self.intrinsic_model.loss)
    self.agent.total_loss += self.agent.to_report['intrinsic_model_loss']
    self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
def main(args):
    # mpi communicator.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # seed.
    workerseed = args.seed + 10000 * comm.Get_rank() if args.seed is not None else None
    if workerseed is not None:
        tc.manual_seed(workerseed % 2 ** 32)
        np.random.seed(workerseed % 2 ** 32)
        random.seed(workerseed % 2 ** 32)

    # logger.
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    # env.
    env = make_atari(args.env_name)
    env.seed(workerseed)
    env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    print(f"frame_stacking: {args.frame_stacking}")
    env = wrap_deepmind(env, frame_stack=args.frame_stacking,
                        clip_rewards=(args.mode == 'train'),
                        episode_life=(args.mode == 'train'))  # See Mnih et al., 2015 -> Methods -> Training Details.
    env.seed(workerseed)

    # agent.
    agent = CnnPolicy(img_channels=env.observation_space.shape[-1],
                      num_actions=env.action_space.n,
                      kind=args.model_type)

    # optimizer and scheduler.
    max_grad_steps = args.optim_epochs * args.env_steps // (comm.Get_size() * args.optim_batchsize)
    optimizer = tc.optim.Adam(agent.parameters(), lr=args.optim_stepsize, eps=1e-5)
    scheduler = tc.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer, max_lr=args.optim_stepsize, total_steps=max_grad_steps,
        pct_start=0.0, anneal_strategy='linear', cycle_momentum=False, div_factor=1.0)

    # checkpoint.
    if rank == 0:
        try:
            state_dict = tc.load(os.path.join(args.checkpoint_dir, args.model_name, 'model.pth'))
            agent.load_state_dict(state_dict)
            print(f"Continuing from checkpoint found at {os.path.join(args.checkpoint_dir, args.model_name, 'model.pth')}")
        except FileNotFoundError:
            print("Bad checkpoint or none on process 0. Continuing from scratch.")

    # sync.
    with tc.no_grad():
        for p in agent.parameters():
            p_data = p.data.numpy()
            comm.Bcast(p_data, root=0)
            p.data.copy_(tc.tensor(p_data).float())

    # operations.
    if args.mode == 'train':
        learn(env=env, agent=agent, optimizer=optimizer, scheduler=scheduler, comm=comm,
              timesteps_per_actorbatch=args.timesteps_per_actorbatch, max_timesteps=args.env_steps,
              optim_epochs=args.optim_epochs, optim_batchsize=args.optim_batchsize,
              gamma=args.gamma, lam=args.lam, clip_param=args.epsilon, entcoeff=args.ent_coef,
              checkpoint_dir=args.checkpoint_dir, model_name=args.model_name)
        env.close()
    elif args.mode == 'play':
        if comm.Get_rank() == 0:
            play(env=env, agent=agent, args=args)
            env.close()
    elif args.mode == 'movie':
        if comm.Get_rank() == 0:
            movie(env=env, agent=agent, args=args)
            env.close()
    else:
        raise NotImplementedError("Mode of operation not supported!")
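# A standalone sketch (assumptions: PyTorch installed, tiny dummy model, made-up step counts)
# showing that the OneCycleLR configuration above, with pct_start=0.0, div_factor=1.0 and
# anneal_strategy='linear', behaves as a plain linear decay from the initial Adam step size
# toward max_lr / final_div_factor (1e4 by default), rather than a warmup-then-anneal cycle.
import torch as tc

model = tc.nn.Linear(4, 2)
optimizer = tc.optim.Adam(model.parameters(), lr=3e-4, eps=1e-5)
scheduler = tc.optim.lr_scheduler.OneCycleLR(
    optimizer=optimizer, max_lr=3e-4, total_steps=1000,
    pct_start=0.0, anneal_strategy='linear', cycle_momentum=False, div_factor=1.0)

for step in range(1000):
    optimizer.step()       # in real training this follows loss.backward()
    scheduler.step()
    if step % 250 == 0:
        print(step, scheduler.get_last_lr()[0])  # decays roughly linearly from 3e-4 toward ~3e-8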
def __init__(self, make_env, hps, num_timesteps, envs_per_process):
    self.make_env = make_env
    self.hps = hps
    self.envs_per_process = envs_per_process
    self.num_timesteps = num_timesteps
    # Initialize ob_space, ac_space, ob_mean, ob_std and build self.envs (the per-process environments).
    self._set_env_vars()
    self.policy = CnnPolicy(scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
                            hidsize=512, feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std,
                            layernormalize=False, nl=tf.nn.leaky_relu)
    # Before building the dynamics model, initialize the feature extractor (defined in
    # auxiliary_task.py); pix2pix amounts to not extracting features at all.
    self.feature_extractor = {
        "none": FeatureExtractor,  # default is "none"
        "idf": InverseDynamics,
        "vaesph": partial(VAE, spherical_obs=True),
        "vaenonsph": partial(VAE, spherical_obs=False),
        "pix2pix": JustPixels
    }[hps['feat_learning']]  # select a feature extractor via the hps setting
    self.feature_extractor = self.feature_extractor(policy=self.policy, features_shared_with_policy=False,
                                                    feat_dim=512, layernormalize=hps['layernorm'])
    # Instantiate the dynamics (environment) model; the feature extractor defined above is passed in as its auxiliary task.
    self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor,
                                 reward_type=hps['reward_type'],
                                 sample_seeds=hps['sample_seeds'])
    self.agent = PpoOptimizer(
        scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space, stochpol=self.policy,
        use_news=hps['use_news'], gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
        nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
        nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'],
        ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'],
        ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'],
        dynamics=self.dynamics,              # dynamics object
        nepochs_dvae=hps["nepochs_dvae"])    # number of extra dynamics (DVAE) training epochs
    # Agent loss: actor, critic and entropy terms; now add the loss from auxiliary feature learning.
    self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
    self.agent.total_loss += self.agent.to_report['aux']
    # Dynamics loss.
    self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
    self.agent.total_loss += self.agent.to_report['dyn_loss']
    # add bai: record the DVAE loss separately, since the DVAE may be trained multiple times.
    self.agent.dynamics_loss = self.agent.to_report['dyn_loss']
    # Variance of the features extracted by the auxiliary task, shape=(512,); tf.reduce_mean below reduces it to a scalar.
    self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
class Trainer(object):
    from baselines import logger

    def __init__(self, make_env, hps, num_timesteps, envs_per_process, exp_name=None, env_name=None,
                 policy=None, feat_ext=None, dyn=None, agent_num=None, restore_name=None):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.depth_pred = hps['depth_pred']
        self.aux_input = hps['aux_input']
        self.num_timesteps = num_timesteps
        self._set_env_vars()
        if exp_name:
            self.exp_name = exp_name
        else:
            self.exp_name = hps['exp_name']
        if env_name:
            self.env_name = env_name
        else:
            self.env_name = hps['env']
        if policy is None:
            if hps['lstm']:
                self.policy = LSTMPolicy(
                    scope='pol', ob_space=self.ob_space, ac_space=self.ac_space, hidsize=512,
                    batchsize=hps['envs_per_process'], feat_dim=512, ob_mean=self.ob_mean,
                    ob_std=self.ob_std, lstm1_size=hps['lstm1_size'], lstm2_size=hps['lstm2_size'],
                    layernormalize=False, nl=tf.nn.leaky_relu, depth_pred=hps['depth_pred'],
                    aux_input=hps['aux_input'],
                )
            else:
                self.policy = CnnPolicy(
                    scope='pol', ob_space=self.ob_space, ac_space=self.ac_space, hidsize=512,
                    feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std,
                    layernormalize=False, nl=tf.nn.leaky_relu
                )
        else:
            self.policy = policy
            self.policy.restore()
        if feat_ext:
            self.feature_extractor = feat_ext
            self.feature_extractor.restore()
        else:
            self.feature_extractor = {"none": FeatureExtractor,
                                      "idf": InverseDynamics,
                                      "vaesph": partial(VAE, spherical_obs=True),
                                      "vaenonsph": partial(VAE, spherical_obs=False),
                                      "pix2pix": JustPixels}[hps['feat_learning']]
            self.feature_extractor = self.feature_extractor(policy=self.policy,
                                                            features_shared_with_policy=hps['feat_share'],
                                                            feat_dim=hps['dyn_feat_dim'],
                                                            layernormalize=hps['layernorm'])
        if dyn:
            self.dynamics = dyn
            self.dynamics.restore()
        else:
            self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
            self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                          predict_from_pixels=hps['dyn_from_pixels'],
                                          feat_dim=hps['dyn_feat_dim'])
        self.agent = PpoOptimizer(
            hps=hps, scope='ppo', ob_space=self.ob_space, env_ob_space=self.env_ob_space,
            ac_space=self.ac_space, stochpol=self.policy, use_news=hps['use_news'],
            gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'], dynamics=self.dynamics,
            exp_name=self.exp_name, env_name=self.env_name, video_log_freq=hps['video_log_freq'],
            model_save_freq=hps['model_save_freq'], use_apples=hps['use_apples'],
            agent_num=agent_num, restore_name=restore_name, multi_envs=hps['multi_train_envs'],
            lstm=hps['lstm'], lstm1_size=hps['lstm1_size'], lstm2_size=hps['lstm2_size'],
            depth_pred=hps['depth_pred'], aux_input=hps['aux_input'], beta_d=hps['beta'],
            early_stop=hps['early_stop'], optim=hps['optim'], decay=hps['decay'],
            grad_clip=hps['grad_clip'], log_grads=hps['log_grads'], logdir=hps['logdir']
        )
        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
        if hps['curiosity']:
            # self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
            self.agent.total_loss += hps['aux_loss_coeff'] * self.agent.to_report['aux']
            # self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
            self.agent.total_loss += hps['dyn_loss_coeff'] * self.agent.to_report['dyn_loss']
            # self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        import numpy as np
        env = self.make_env(0, add_monitor=False)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.env_ob_space = env.observation_space
        if self.depth_pred:
            self.ob_space = gym.spaces.Box(0, 255, shape=(84, 84, 3), dtype=np.uint8)
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env, depth_pred=self.hps['depth_pred'])
        del env
        self.envs = [functools.partial(self.make_env, i) for i in range(self.envs_per_process)]

    def train(self, saver, sess, restore=False):
        from baselines import logger
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
        if restore:
            print("Restoring model for training")
            saver.restore(sess, "models/" + self.hps['restore_model'] + ".ckpt")
            print("Loaded model", self.hps['restore_model'])
        write_meta_graph = False
        while True:
            info = self.agent.step()
            if info['update']:
                if info['update']['recent_best_ext_ret'] is None:
                    info['update']['recent_best_ext_ret'] = 0
                wandb.log(info['update'])
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
        if self.hps['tune_env']:
            filename = "models/" + self.hps['restore_model'] + "_tune_on_" + self.hps['tune_env'] + "_final.ckpt"
        else:
            filename = "models/" + self.hps['exp_name'] + "_final.ckpt"
        saver.save(sess, filename, write_meta_graph=False)
        self.policy.save_model(self.hps['exp_name'], 'final')
        self.agent.stop_interaction()
class Trainer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self._set_env_vars()
        self.policy = CnnPolicy(scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
                                hidsize=512, feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std,
                                layernormalize=False, nl=tf.nn.leaky_relu)
        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(policy=self.policy, features_shared_with_policy=False,
                                                        feat_dim=512, layernormalize=hps['layernorm'])
        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                      predict_from_pixels=hps['dyn_from_pixels'],
                                      pred_discount=hps['pred_discount'],
                                      num_preds=hps['num_preds'], feat_dim=512)
        # Setting dynamics object in policy for feature extraction.
        self.policy.set_dynamics(self.dynamics)
        self.dynamics.set_loss()
        self.agent = PpoOptimizer(
            scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space, stochpol=self.policy,
            use_news=hps['use_news'], gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'], dynamics=self.dynamics)
        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss1 + self.dynamics.loss2)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        env = self.make_env(0, add_monitor=True)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        del env
        self.envs = [functools.partial(self.make_env, i) for i in range(self.envs_per_process)]

    def train(self):
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
        while True:
            info = self.agent.step()
            if info['update']:
                logger.logkvs(info['update'])
                logger.dumpkvs()
            if self.agent.rollout.stats['tcount'] > self.num_timesteps:
                break
        self.agent.stop_interaction()
class Tester(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        # Initialize ob_space, ac_space, ob_mean, ob_std and build self.envs (the per-process environments).
        self._set_env_vars()
        self.policy = CnnPolicy(scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
                                hidsize=512, feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std,
                                layernormalize=False, nl=tf.nn.leaky_relu)
        self.feature_extractor = FeatureExtractor(policy=self.policy, features_shared_with_policy=False,
                                                  feat_dim=512, layernormalize=hps['layernorm'])
        # Instantiate the dynamics (environment) model; the feature extractor defined above is passed in as its auxiliary task.
        self.dynamics = DvaeDynamics(auxiliary_task=self.feature_extractor,
                                     reward_type=hps['reward_type'],
                                     sample_seeds=hps['sample_seeds'])
        self.agent = PpoOptimizer(
            scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space, stochpol=self.policy,
            use_news=hps['use_news'], gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'], dynamics=self.dynamics,
            nepochs_dvae=0)
        # Agent loss: actor, critic and entropy terms; now add the loss from auxiliary feature learning.
        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        # Dynamics loss: accumulate all dynamics losses.
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        # Variance of the features extracted by the auxiliary task, shape=(512,); tf.reduce_mean below reduces it to a scalar.
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def _set_env_vars(self):
        """
        This env exists only to compute ob_space, ac_space, ob_mean and ob_std, so it is deleted
        once they are computed. self.envs_per_process envs are then created as partials.
        """
        env = self.make_env(0)
        # ob_space.shape=(84, 84, 4), ac_space=Discrete(4)
        self.ob_space, self.ac_space = env.observation_space, env.action_space
        # A random agent interacts with the environment to compute the observation mean and std.
        # ob_mean.shape=(84,84,4) with values in [0, 255]; ob_std is a scalar (about 1.8 in Breakout).
        self.ob_mean, self.ob_std = random_agent_ob_mean_std(env)
        if self.hps["env_kind"] == "unity":
            env.close()
        del env
        self.envs = [functools.partial(self.make_env, i) for i in range(self.envs_per_process)]

    def play(self, tf_sess, args_tmp, saver, model_path):
        print("model_path: ", model_path)
        with tf_sess.as_default():
            print("Load weights..")
            saver.restore(tf_sess, model_path)
            print("Load done.")
            # rollout
            env = self.make_env(0)
            max_reward = -10000.
            for i in range(5):
                obs = env.reset()
                rews, frames = [], []
                while True:
                    obs = np.expand_dims(np.squeeze(obs), axis=0)
                    assert obs.shape == (1, 84, 84, 4)
                    acs, vpreds, nlps = self.policy.get_ac_value_nlp(obs)
                    obs, rew, done, info = env.step(acs[0])
                    rews.append(rew)
                    obs = np.array(obs)
                    frames.append(env.render(mode='rgb_array'))
                    if done:
                        break
                if max_reward < np.sum(rews):
                    max_reward = np.sum(rews)
                    print("Max rewards:", max_reward)
                    save_np_as_mp4(frames, "/Users/bai/Desktop/video/" + args_tmp['env'] + '.mp4')
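# The play() rollout above calls a save_np_as_mp4 helper that is not shown in this section.
# A minimal stand-in sketch, assuming imageio plus the imageio-ffmpeg plugin are installed
# (the real helper in the codebase may differ):
import imageio
import numpy as np

def save_np_as_mp4(frames, save_path, fps=30):
    # frames: list of HxWx3 uint8 RGB arrays, e.g. from env.render(mode='rgb_array')
    frames = [np.asarray(f, dtype=np.uint8) for f in frames]
    imageio.mimsave(save_path, frames, fps=fps)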
class Scorer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            None, hps['env'], nsteps=1, load=True)
        self.env = self.make_env(258)
        self.policy = CnnPolicy(scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
                                hidsize=512, feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std,
                                layernormalize=False, nl=tf.nn.leaky_relu)
        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(policy=self.policy, features_shared_with_policy=False,
                                                        feat_dim=512, layernormalize=hps['layernorm'])
        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                      predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512)
        self.agents = [
            # self.create_agent('presub095', hps),
            self.create_agent('presub089', hps),
            # self.create_agent('presub088', hps),
            # self.create_agent('presub087', hps),
            # self.create_agent('presub047', hps),
            # self.create_agent('presub018', hps),
            # self.create_agent('presub001', hps),
            # self.create_agent('presub002', hps),
            # self.create_agent('presub004', hps),
            # self.create_agent('presub005', hps),
            # self.create_agent('presub015', hps),
            # self.create_agent('presub016', hps),
            # self.create_agent('presub017', hps),
            # self.create_agent('presub019', hps),
            # self.create_agent('presub020', hps),
            # self.create_agent('presub021', hps),
        ]

    def create_agent(self, exp_name, hps):
        # graph = tf.Graph()
        # graph.as_default()
        agent = PpoOptimizer(
            scope=exp_name, ob_space=self.ob_space, ac_space=self.ac_space, stochpol=self.policy,
            use_news=hps['use_news'], gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'], dynamics=self.dynamics,
            load=hps['load'], exp_name=exp_name,
        )
        agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        agent.total_loss += agent.to_report['aux']
        agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        agent.total_loss += agent.to_report['dyn_loss']
        agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
        # agent.graph = graph
        # tf.reset_default_graph()
        return agent

    def score(self):
        episode_reward = 0
        episode_rewards = []
        total_episodes = 0
        samples = 0
        obs = np.empty((len(self.agents) + samples, 1, *self.ob_space.shape), np.float32)
        obs[0] = self.env.reset()
        max_level = 0
        max_levels = []
        for agent in self.agents:
            agent.no_mpi_start_interaction([self.env], nlump=self.hps['nlumps'], dynamics=self.dynamics)
        # if is_grading(self.env):
        #     while not done_grading(self.env):
        #         # run_episode(env)
        #         done = False
        #         episode_reward = 0.0
        #         while not done:
        #             action = env.action_space.sample()
        #             obs, reward, done, info = env.step(action)
        #             episode_reward += reward
        #         self.env.reset()
        #     return
        while True:
            # aa = obs.reshape([len(obs) * 1, 1, *self.ob_space.shape])
            for i in range(len(self.agents) - 1):
                obs[1 + i] = obs[0]
            for i in range(samples):
                mu, sigma = 0, 0.1
                noise = np.random.normal(mu, sigma, obs[0].shape)
                obs[len(self.agents) + i] = obs[0] + noise
            # obs[1] = np.copy(obs[0])
            # obs[1] = cv2.randn(obs[1], (128), (9))
            action_scores, acs, vpreds, nlps = self.policy.inference_get_ac_value_nlp(obs)
            max_actions = np.unravel_index(action_scores.argmax(), action_scores.shape)
            max_action = max_actions[1]
            max_v = vpreds.argmax()
            max_npl = nlps.argmax()
            min_npl = nlps.argmin()
            action = acs[0]  # default
            # action = int(max_action)      # based on highest scoring action
            # action = int(acs[max_v])      # based on highest value
            # action = int(acs[min_npl])    # based on min npl
            # action = action_scores[min_npl].argmax()
            ob, reward, done, _ = self.env.step(action)
            obs[0] = ob
            episode_reward += reward
            if reward == 1:
                max_level += 1
            if done:
                episode_rewards.append(episode_reward)
                ave_reward = sum(episode_rewards) / len(episode_rewards)
                total_episodes += 1
                max_levels.append(max_level)
                ave_level = sum(max_levels) / len(max_levels)
                print('ep:', total_episodes, 'level:', max_level, 'ave_level:', round(ave_level, 2),
                      'episode_reward:', episode_reward, 'ave_reward', round(ave_reward, 2))
                episode_reward = 0
                max_level = 0
                if is_grading(self.env):
                    if done_grading(self.env):
                        break
                elif total_episodes >= 25:
                    break
                obs[0] = self.env.reset()
        self.env.close()
class Scorer(object):
    def __init__(self, make_env, hps, num_timesteps, envs_per_process):
        self.make_env = make_env
        self.hps = hps
        self.envs_per_process = envs_per_process
        self.num_timesteps = num_timesteps
        # self._set_env_vars()
        self.ob_mean, self.ob_std, self.ob_space, self.ac_space = random_agent_ob_mean_std(
            None, hps['env'], nsteps=1, load=True)
        # env = self.make_env(256, add_monitor=False, sleep_multiple=1./32)
        # self.ob_space, self.ac_space = env.observation_space, env.action_space
        # env.close()
        # del env
        self.envs = [functools.partial(self.make_env, i + 256 + 1) for i in range(envs_per_process)]
        self.policy = CnnPolicy(scope='pol', ob_space=self.ob_space, ac_space=self.ac_space,
                                hidsize=512, feat_dim=512, ob_mean=self.ob_mean, ob_std=self.ob_std,
                                layernormalize=False, nl=tf.nn.leaky_relu)
        self.feature_extractor = {
            "none": FeatureExtractor,
            "idf": InverseDynamics,
            "vaesph": partial(VAE, spherical_obs=True),
            "vaenonsph": partial(VAE, spherical_obs=False),
            "pix2pix": JustPixels
        }[hps['feat_learning']]
        self.feature_extractor = self.feature_extractor(policy=self.policy, features_shared_with_policy=False,
                                                        feat_dim=512, layernormalize=hps['layernorm'])
        self.dynamics = Dynamics if hps['feat_learning'] != 'pix2pix' else UNet
        self.dynamics = self.dynamics(auxiliary_task=self.feature_extractor,
                                      predict_from_pixels=hps['dyn_from_pixels'], feat_dim=512)
        self.agent = PpoOptimizer(
            scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space, stochpol=self.policy,
            use_news=hps['use_news'], gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'],
            nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1,
            nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'],
            ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'],
            ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'], dynamics=self.dynamics,
            load=hps['load'], exp_name=hps['exp_name'],
        )
        self.agent.to_report['aux'] = tf.reduce_mean(self.feature_extractor.loss)
        self.agent.total_loss += self.agent.to_report['aux']
        self.agent.to_report['dyn_loss'] = tf.reduce_mean(self.dynamics.loss)
        self.agent.total_loss += self.agent.to_report['dyn_loss']
        self.agent.to_report['feat_var'] = tf.reduce_mean(tf.nn.moments(self.feature_extractor.features, [0, 1])[1])

    def score(self):
        self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'], dynamics=self.dynamics)
        from time import sleep
        sleep(2)
        episode_reward = 0
        episode_rewards = []
        total_episodes = 0
        max_level = 0
        max_levels = []
        while True:
            # info = self.agent.step()
            # self.agent.rollout.collect_rollout()
            obs, prevrews, news, infos = self.agent.rollout.env_get(0)
            if prevrews is not None:
                episode_reward += prevrews
                if prevrews == 1:
                    max_level += 1
                if news:
                    episode_rewards.append(episode_reward)
                    ave_reward = sum(episode_rewards) / len(episode_rewards)
                    total_episodes += 1
                    max_levels.append(max_level)
                    ave_level = sum(max_levels) / len(max_levels)
                    ave_level = np.around(ave_level, 2)
                    ave_reward = np.around(ave_reward, 2)
                    print('ep:', total_episodes, 'level:', max_level, 'ave_level:', ave_level,
                          'episode_reward:', episode_reward, 'ave_reward', ave_reward)
                    episode_reward = 0
                    max_level = 0
                    if total_episodes >= 25:
                        break
            # acs, vpreds, nlps = self.agent.rollout.policy.get_ac_value_nlp(obs)
            # self.agent.rollout.env_step(0, acs)
            acs, vpreds, nlps = self.policy.get_ac_value_nlp(obs)
            self.agent.rollout.env_step(0, acs)
            self.agent.rollout.step_count += 1
        self.agent.stop_interaction()
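# The Scorer above calls random_agent_ob_mean_std(None, hps['env'], nsteps=1, load=True), i.e. it
# loads precomputed observation statistics instead of re-running a random agent at startup. A rough
# sketch of the save/load pattern such a helper might rely on; the file name, keys, and function
# names below are assumptions for illustration, not the repo's actual API:
import numpy as np

def save_ob_stats(ob_mean, ob_std, path='ob_stats.npz'):
    # Persist the statistics computed once by a random-agent rollout.
    np.savez(path, ob_mean=ob_mean, ob_std=ob_std)

def load_ob_stats(path='ob_stats.npz'):
    # Reload them at scoring time, skipping the expensive random-agent warmup.
    stats = np.load(path)
    return stats['ob_mean'], stats['ob_std']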