def test_env_after_learn(algo):
    def make_env():
        # acktr requires too much RAM, fails on travis
        env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
def train(env_id, num_frames, seed, num_cpu): num_timesteps = int(num_frames / 4 * 1.1) def make_env(rank): def _thunk(): env = gym.make(env_id) env.seed(seed + rank) if logger.get_dir(): env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank))) gym.logger.setLevel(logging.WARN) return wrap_deepmind(env) return _thunk set_global_seeds(seed) env = SubprocVecEnv([make_env(i) for i in range(num_cpu)]) policy_fn = CnnPolicy learn(policy_fn, env, seed, total_timesteps=num_timesteps, nprocs=num_cpu) env.close()
def train(env_id, num_timesteps=300, seed=0, num_env=2, renderer='tiny'): def make_env(rank): def _thunk(): if env_id == "TestEnv": env = TestEnv(renderer=renderer) #gym.make(env_id) else: env = gym.make(env_id) env.seed(seed + rank) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) gym.logger.setLevel(logging.WARN) # only clip rewards when not evaluating return env return _thunk set_global_seeds(seed) env = SubprocVecEnv([make_env(i) for i in range(num_env)]) env.reset() start = time.time() for i in range(num_timesteps): action = [env.action_space.sample() for _ in range(num_env)] env.step(action) stop = time.time() duration = (stop - start) if (duration): fps = num_timesteps / duration else: fps = 0 env.close() return num_env, fps
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu): num_timesteps = int(num_frames / 4 * 1.1) # divide by 4 due to frameskip, then do a little extras so episodes end def make_env(rank): def _thunk(): env = gym.make(env_id) env.seed(seed + rank) env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), "{}.monitor.json".format(rank))) gym.logger.setLevel(logging.WARN) return wrap_deepmind(env) return _thunk set_global_seeds(seed) env = SubprocVecEnv([make_env(i) for i in range(num_cpu)]) if policy == 'cnn': policy_fn = CnnPolicy elif policy == 'lstm': policy_fn = LstmPolicy elif policy == 'lnlstm': policy_fn = LnLstmPolicy learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule) env.close()
def register_and_create_Envs(id_tmp_dir, seed, environment, rl_setting): """ Register environment, create vector of n_e environments and return it. Args: id_temp_dir (str): Working directory. All other args are automatically provided by sacred by passing the equally named configuration variables that are either defined in the yaml files or the command line. """ if environment['entry_point']: try: register( id=environment['name'], entry_point=environment['entry_point'], kwargs=environment['config'], max_episode_steps=environment['max_episode_steps'] ) except Exception: pass envs = [make_env(environment['name'], seed, i, id_tmp_dir, frameskips_cases=environment['frameskips_cases']) for i in range(rl_setting['num_processes'])] # Vectorise envs if rl_setting['num_processes'] > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) # Normalise rewards. Unnecessary for Atari, unwanted for Mountain Hike. # Probably useful for MuJoCo? # if len(envs.observation_space.shape) == 1: if environment['vec_norm']: envs = VecNormalize(envs) return envs
def make_doom_env(num_env, seed, name): def make_env(rank): # pylint: disable=C0111 def _thunk(): if name == 'shoot': env = ShootEnv() env.seed(rank) elif name == 'navi': env = NaviEnv() env.seed(rank) elif name == 'consnavi': env = ConservativeNaviEnv() env.seed(rank) elif name == 'mixed': env = MixedEnv() env.seed(rank) elif name == 'dodge': env = DodgeEnv() env.seed(rank) elif name == 'upfloor': env = UpFloorEnv() env.seed(seed + rank) elif name == 'finddoor': env = FindDoorEnv() env.seed(rank) elif name == 'gather': env = GatherEnv() env.seed(rank) else: print('Invalid env name') #For finddoor env #env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) return env return _thunk set_global_seeds(seed) return SubprocVecEnv([make_env(i) for i in range(num_env)])
def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, env_kwargs=None, start_index=0, reward_scale=1.0, flatten_dict_observations=True, gamestate=None, initializer=None, force_dummy=False): """ Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo. """ wrapper_kwargs = wrapper_kwargs or {} env_kwargs = env_kwargs or {} mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 seed = seed + 10000 * mpi_rank if seed is not None else None logger_dir = logger.get_dir() def make_thunk(rank, initializer=None): return lambda: make_env( env_id=env_id, env_type=env_type, mpi_rank=mpi_rank, subrank=rank, seed=seed, reward_scale=reward_scale, gamestate=gamestate, flatten_dict_observations=flatten_dict_observations, wrapper_kwargs=wrapper_kwargs, env_kwargs=env_kwargs, logger_dir=logger_dir, initializer=initializer ) set_global_seeds(seed) if not force_dummy and num_env > 1: return SubprocVecEnv([make_thunk(i + start_index, initializer=initializer) for i in range(num_env)]) else: return DummyVecEnv([make_thunk(i + start_index, initializer=None) for i in range(num_env)])
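# A minimal usage sketch for the factory above, assuming a baselines-style setup with the
# helpers it delegates to importable and gym[atari] installed; the environment id and the
# num_env/seed values are illustrative, not taken from the original code.
venv = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=8, seed=0)
obs = venv.reset()                                    # batched: (num_env, *obs_shape)
actions = [venv.action_space.sample() for _ in range(venv.num_envs)]
obs, rews, dones, infos = venv.step(actions)          # one step in every worker process
venv.close()                                          # shut the subprocess workers down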
def main(): """Run PPO until the environment throws an exception.""" config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 env_fns, env_names = create_envs() with tf.Session(config=config): # Take more timesteps than we need to be sure that # we stop due to an exception. ppo2.learn(policy=policies.CnnPolicy, env=SubprocVecEnv(env_fns), nsteps=4096, nminibatches=8, lam=0.95, gamma=0.99, noptepochs=3, log_interval=1, ent_coef=0.01, lr=lambda _: 2e-4, cliprange=lambda _: 0.1, total_timesteps=int(1e9), save_interval=10, save_path='./checkpoints_joint_ppo2', load_path=None)
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + 10000 * mpi_rank + rank if seed is not None else None)
            env = Monitor(env,
                          logger.get_dir() and os.path.join(logger.get_dir(),
                                                            str(mpi_rank) + '.' + str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk

    set_global_seeds(seed)
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
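# Why these factories return a zero-argument _thunk instead of a ready-made env: SubprocVecEnv
# expects callables so each worker process constructs its own environment, and routing the loop
# variable through make_env(rank) sidesteps Python's late-binding closures (the same reason the
# football snippet below writes "lambda _i=i: ..."). A tiny, env-free sketch of that pitfall:
ranks_bad = [lambda: rank for rank in range(4)]          # every callable returns 3 (late binding)
ranks_good = [(lambda r=rank: r) for rank in range(4)]   # a default argument freezes each value
assert [f() for f in ranks_bad] == [3, 3, 3, 3]
assert [f() for f in ranks_good] == [0, 1, 2, 3]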
def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari.
    """
    if wrapper_kwargs is None:
        wrapper_kwargs = {}

    def make_env(rank):  # pylint: disable=C0111
        def _thunk():
            env = make_atari(env_id)
            env.seed(seed + rank)
            # Monitor wraps the gym env and mainly records episode statistics
            # (reward, length) when an episode ends.
            env = Monitor(env,
                          logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
            return wrap_deepmind(env, **wrapper_kwargs)
        return _thunk

    set_global_seeds(seed)
    # SubprocVecEnv executes each thunk created above in its own subprocess;
    # (i + start_index) gives every environment a different seed.
    return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0, gamestate=None): """ Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo. """ if wrapper_kwargs is None: wrapper_kwargs = {} mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 seed = seed + 10000 * mpi_rank if seed is not None else None def make_thunk(rank): return lambda: make_env( env_id=env_id, env_type=env_type, subrank = rank, seed=seed, reward_scale=reward_scale, gamestate=gamestate, wrapper_kwargs=wrapper_kwargs ) set_global_seeds(seed) if num_env > 1: return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)]) else: return DummyVecEnv([make_thunk(start_index)])
def make_cartpole_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0): """ Create a wrapped, monitored SubprocVecEnv for CartPole. """ if wrapper_kwargs is None: wrapper_kwargs = {} def make_env(rank): # pylint: disable=C0111 def _thunk(): env = gym.make(env_id) env.seed(seed + rank) env = Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) return env return _thunk set_global_seeds(seed) return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, add_timestep,
                  device, allow_early_resets):
    envs = [
        make_env(env_name, seed, i, log_dir, add_timestep, allow_early_resets)
        for i in range(num_processes)
    ]

    if len(envs) > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    if len(envs.observation_space.shape) == 1:
        if gamma is None:
            envs = VecNormalize(envs, ret=False)
        else:
            envs = VecNormalize(envs, gamma=gamma)

    envs = VecPyTorch(envs, device)

    if len(envs.observation_space.shape) == 3:
        envs = VecPyTorchFrameStack(envs, 4, device)

    return envs
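# When VecNormalize is in the stack (as above), its running observation statistics (ob_rms)
# live on that wrapper; several training scripts later in this collection save them next to
# the model ("hasattr(envs, 'ob_rms') and envs.ob_rms or None") so evaluation can reuse the
# same normalization. A minimal sketch of that idea; the file paths and function names here
# are illustrative assumptions, only the ob_rms attribute comes from the snippets themselves.
import pickle

def save_ob_rms(vec_norm_envs, path='ob_rms.pkl'):
    ob_rms = getattr(vec_norm_envs, 'ob_rms', None)   # running mean/var of observations
    with open(path, 'wb') as f:
        pickle.dump(ob_rms, f)

def restore_ob_rms(vec_norm_envs, path='ob_rms.pkl'):
    with open(path, 'rb') as f:
        ob_rms = pickle.load(f)
    if ob_rms is not None:
        vec_norm_envs.ob_rms = ob_rms   # reuse the training-time statistics at evaluation time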
def make_obstacle_tower(num, seed=0, show=False): assert ObstacleTowerEnv is not None,\ 'install https://github.com/Unity-Technologies/obstacle-tower-env' def make_env(rank): def _thunk(): env = ObstacleTowerEnv('../ObstacleTower/obstacletower', retro=True, worker_id=rank, realtime_mode=show, config={'total-floors': 20}) env.seed(seed + rank % 8) env = bench.Monitor(env, None, allow_early_resets=True) env = OTWrapper(env) env = FrameStack(env, 4) return env return _thunk envs = [make_env(i) for i in range(num)] envs = SubprocVecEnv(envs, context='fork') envs = VecPyTorch(envs) return envs
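# Note on the context='fork' argument above: baselines-style SubprocVecEnv implementations
# accept a multiprocessing start-method name ('fork', 'spawn', 'forkserver') and often default
# to 'spawn'. With 'spawn' the env thunks must be picklable and the vec env should be created
# under a __main__ guard, roughly as sketched below. This is only a sketch: it assumes the
# make_obstacle_tower factory above and the Obstacle Tower binary it expects are available.
if __name__ == '__main__':
    envs = make_obstacle_tower(num=4, seed=0)
    try:
        obs = envs.reset()
    finally:
        envs.close()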
def make_neyboy_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0, allow_early_resets=False, frame_skip=4, save_video=False): """ Create a wrapped, monitored SubprocVecEnv for Neyboy. """ if wrapper_kwargs is None: wrapper_kwargs = {} def make_env(rank): def _thunk(): env = make_neyboy_environment(env_id, seed, rank, allow_early_resets, frame_skip=frame_skip, save_video=save_video) # env = Cropper(env) env = WarpFrame(env) return env return _thunk set_global_seeds(seed) envs = [make_env(i + start_index) for i in range(num_env)] if num_env > 1: env = SubprocVecEnv(envs) else: env = DummyVecEnv(envs) return env
def make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0): """ Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo. """ if wrapper_kwargs is None: wrapper_kwargs = {} mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 def make_env(rank): # pylint: disable=C0111 def _thunk(): env = make_atari(env_id) if env_type == 'atari' else gym.make( env_id) env.seed(seed + 10000 * mpi_rank + rank if seed is not None else None) env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank)), allow_early_resets=True) if env_type == 'atari': return wrap_deepmind(env, **wrapper_kwargs) elif reward_scale != 1: return RewardScaler(env, reward_scale) else: return env return _thunk set_global_seeds(seed) if num_env > 1: return SubprocVecEnv( [make_env(i + start_index) for i in range(num_env)]) else: return DummyVecEnv([make_env(start_index)])
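# The num_env > 1 branch above is a common convention: a single environment is cheaper to run
# in-process via DummyVecEnv, and far easier to step through in a debugger, than behind a
# subprocess. The same idea appears as the force_dummy flag in the make_vec_env variant earlier
# in this collection. A small debugging-oriented sketch, with an illustrative flag name:
def make_debuggable_vec_env(env_fns, debug=False):
    # force everything in-process while debugging, regardless of how many envs there are
    return DummyVecEnv(env_fns) if debug else SubprocVecEnv(env_fns)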
def make_dm_control(domain_name, task_name, num_env, seed, frame_stack, vis_reward=False, wrapper_kwargs={}, start_index=0): """ Create a wrapped, monitored SubprocVecEnv for Atari. """ if wrapper_kwargs is None: wrapper_kwargs = {} def wrap_env(seed): env = suite.load(domain_name, task_name, task_kwargs={'random': seed}, visualize_reward=vis_reward) env = pixels.Wrapper(env, pixels_only=False) env = MakeGym(env) env = WarpFrame(env, keep_obs=True) env = FrameStack(env, frame_stack, keep_obs=True) return env def make_env(rank): # pylint: disable=C0111 def _thunk(): env = wrap_env(seed + rank) env = Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) return env return _thunk set_global_seeds(seed) return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)])
def train(_):
    """Trains a PPO2 policy."""
    num_envs = 8  # number of environments to run in parallel
    vec_env = SubprocVecEnv([(lambda _i=i: create_multiagent_env(_i))
                             for i in range(num_envs)], context=None)

    # Import tensorflow after we create the environments. TF is not fork-safe, and we could be
    # using TF as part of the environment if one of the players is controlled by an already
    # trained model.
    ncpu = multiprocessing.cpu_count()
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=ncpu,
                            inter_op_parallelism_threads=ncpu)
    config.gpu_options.allow_growth = True
    tf.Session(config=config).__enter__()

    ppo2.learn(
        network='gfootball_impala_cnn',
        total_timesteps=1e6 + 1,
        env=vec_env,
        seed=0,
        nsteps=128,
        nminibatches=8,
        noptepochs=2,
        max_grad_norm=0.64,
        gamma=0.993,
        ent_coef=0.003,
        lr=0.000343,
        log_interval=10,
        save_interval=10,
        cliprange=0.8,
        load_path='/Users/stephen/Documents/football/checkpoints/11_vs_11_easy_stochastic_v2')
def make_vec_envs(env_name, seed, num_processes, gamma, log_dir, add_timestep, device, allow_early_resets, num_frame_stack=None, new_wrapper=False, clip_rewards=False, primitive_reward=False): envs = [ make_env(env_name, seed, i, log_dir, add_timestep, allow_early_resets, new_wrapper, clip_rewards, primitive_reward) for i in range(num_processes) ] if len(envs) > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: if gamma is None: envs = VecNormalize(envs, ret=False) else: envs = VecNormalize(envs, gamma=gamma) envs = VecPyTorch(envs, device) if num_frame_stack is not None: envs = VecPyTorchFrameStack(envs, num_frame_stack, device) elif len(envs.observation_space.shape) == 3: envs = VecPyTorchFrameStack(envs, 4, device) return envs
def test(num_env_steps, num_processes, log_dir, env_name, model_name, save_dir): records = [] epoch = 0 envs = [make_env(rank = i) for i in range(num_processes)] if len(envs) > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) try: state_shape = envs.observation_space.shape[0] action_shape = envs.action_space.shape[0] model = model_dict[model_name](state_shape, action_shape) state_dict = torch.load(os.path.join(save_dir, model_name,env_name, model_name+'_Final.pt')) model.load_state_dict(state_dict) state = envs.reset() returns = 0 for t in range(num_env_steps//num_processes): action, log_prob = model.act(state) next_state, reward, done, info = envs.step(to_np(action)) returns += reward for i, d in enumerate(done): if d: records.append(returns[i]) returns[i] = 0 epoch += 1 if epoch >= 100: break state = next_state records = np.array(records) print("# of epoch: {0}".format(epoch)) print("mean: {0}".format(np.mean(records))) print("std: {0}".format(np.std(records))) print("max: {0}".format(np.max(records))) print("min: {0}".format(np.min(records))) print("median: {0}".format(np.median(records))) except Exception as e: traceback.print_exc() finally: envs.close()
def my_make_vec_env(env_id, env_type, num_env, seed, wrapper_kwargs=None, start_index=0, reward_scale=1.0): if wrapper_kwargs is None: wrapper_kwargs = {} mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 def make_env(rank): # pylint: disable=C0111 def _thunk(): env = ProstheticsEnv(visualize=False) env.seed(seed + 10000 * mpi_rank + rank if seed is not None else None) env = ForceDictObservation(env) env = DictToListFull(env) env = JSONable(env) env = Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank))) if reward_scale != 1: return RewardScaler(env, reward_scale) else: return env return _thunk set_global_seeds(seed) if num_env > 1: return SubprocVecEnv( [make_env(i + start_index) for i in range(num_env)]) else: return DummyVecEnv([make_env(start_index)])
def main():
    config = tf.ConfigProto()
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    config.gpu_options.allow_growth = True

    with tf.Session(config=config):
        model.learn(
            policy=policies.A2CPolicy,
            env=SubprocVecEnv([
                env.make_train_0, env.make_train_1, env.make_train_2,
                env.make_train_3, env.make_train_4, env.make_train_5,
                env.make_train_6, env.make_train_7, env.make_train_8,
                env.make_train_9, env.make_train_10, env.make_train_11,
                env.make_train_12
            ]),
            nsteps=2048,  # steps per environment per update
            total_timesteps=10000000,
            gamma=0.99,
            lam=0.95,
            vf_coef=0.5,
            ent_coef=0.01,
            lr=2e-4,
            max_grad_norm=0.5,
            log_interval=10)
def create_env_vec(self, env_id, seed, num_workers): # divide by 4 due to frameskip, then do a little extras so episodes end def make_env(rank): def _thunk(): env = gym.make(env_id) env.seed(seed + rank) env = bench.Monitor( env, self.save_path and os.path.join( self.save_path, "{}.monitor.json".format(rank))) if env_id.startswith('CartPole') or env_id.startswith( 'Acrobot'): env = NumpyWrapper(env) elif env_id.startswith('MountainCar'): env = MountainCarNumpyWrapper(env) elif 'NoFrameskip' in env.spec.id: env = wrap_deepmind(env) return env return _thunk set_global_seeds(seed) env = SubprocVecEnv([make_env(i) for i in range(num_workers)]) return env
def make_env(self, env_id, seed, id=None, num_processes=None, force_new=True): if id in self.envs: ''' if force_new or env_id != self.env_id or self.num_envs != num_processes: self.close() else: print('env existed, use created env') return True ''' self.close(id) if num_processes is None: num_processes = cpu_count() self.envs[id] = SubprocVecEnv([ make_env(env_id, seed, rank=i, log_dir=None, visualize=False) for i in range(num_processes) ]) print('Started! env_id:{}, seed:{}, num_processes:{}, id:{}'.format( env_id, seed, num_processes, id)) return True
def make_vec_env(env_id, env_type, num_env, seed, prioritize, n_active_envs, wrapper_kwargs=None, start_index=0, reward_scale=1.0, flatten_dict_observations=True, gamestate=None, ): """ Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo. """ wrapper_kwargs = wrapper_kwargs or {} mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 seed = seed + 10000 * mpi_rank if seed is not None else None def make_thunk(rank): return lambda: make_env( env_id=env_id, env_type=env_type, subrank=rank, seed=seed, reward_scale=reward_scale, gamestate=gamestate, flatten_dict_observations=flatten_dict_observations, wrapper_kwargs=wrapper_kwargs ) set_global_seeds(seed) if num_env > 1: if prioritize: return ModifiedSubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)], n_active_envs=n_active_envs) else: return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)]) else: return DummyVecEnv([make_thunk(start_index)])
def make_aai_env(env_directory, num_env, arenas_configurations, start_index=0): """ Create a wrapped, monitored Unity environment. """ def make_env(rank, arena_configuration): # pylint: disable=C0111 def _thunk(): env = AnimalAIGym( environment_filename=env_directory, worker_id=rank, flatten_branched=True, arenas_configurations=arena_configuration, uint8_visual=True, ) env = Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) return env return _thunk return SubprocVecEnv([ make_env(i + start_index, arenas_configurations) for i in range(num_env) ])
def test_env_after_learn(algo):
    def make_env():
        env = gym.make('PongNoFrameskip-v4')
        return env

    make_session(make_default=True, graph=tf.Graph())
    env = SubprocVecEnv([make_env])

    learn = get_learn_function(algo)
    network = cnn(one_dim_bias=True)

    # Commenting out the following line resolves the issue, though crash happens at env.reset().
    learn(network=network, env=env, total_timesteps=0, load_path=None, seed=None)

    env.reset()
    env.close()
def train(env_id, num_frames, seed, policy, lrschedule, num_cpu): num_timesteps = int(num_frames / 4 * 1.1) # divide by 4 due to frameskip, then do a little extras so episodes end def make_env(rank): def _thunk(): env = gym.make(env_id) env.seed(seed + rank) env = gym.wrappers.Monitor(env, directory='/home/vasu/Desktop/a2c_json', force=True) env = bench.Monitor( env, logger.get_dir() and os.path.join( logger.get_dir(), "{}.monitor.json".format(rank))) env.reset() env.render() gym.logger.setLevel(logging.WARN) return wrap_deepmind(env) return _thunk set_global_seeds(seed) env = SubprocVecEnv([make_env(i) for i in range(num_cpu)]) if policy == 'cnn': policy_fn = CnnPolicy elif policy == 'lstm': policy_fn = LstmPolicy elif policy == 'lnlstm': policy_fn = LnLstmPolicy learn(policy_fn, env, seed, total_timesteps=num_timesteps, lrschedule=lrschedule) env.reset() env.close()
def train(self): #my laptop only has 8 cores and I generally use 8 actors for stuff, so make sure that the multiprocessing module doesn't try to give each actor multiple threads and make them fight os.environ['OMP_NUM_THREADS'] = '1' #make the environments and set them to run in parallel #thank you OpenAI for doing the multiprocessing stuff for me envs = [self.make_env(self.env_name, 42, n) for n in range(self.N)] envs = SubprocVecEnv(envs) obs_shape = envs.observation_space.shape #create policy network and set it to training mode entry_obs_shape = (obs_shape[0] * self.num_stack, *obs_shape[1:]) self.policy = Policy(entry_obs_shape, envs.action_space) self.policy.train() #create storage for past actions rollouts = RolloutStorage() #set optimizer for updating the weights of our network optimizer = optim.Adam(self.policy.parameters(), lr=self.lr, eps=self.eps) #load saved weights if you can if os.path.isfile(self.filename): print("loading saved params") self.policy.load_state_dict(torch.load(self.filename)) #init some variables to track how much reward we're getting episode_rewards = torch.zeros([self.N, 1]) final_rewards = torch.zeros([self.N, 1]) #init the stack #with most things we won't stack inputs, but having a 'num_stack' works the same as not having a stack at all so we good stacked_s = torch.zeros(self.N, self.num_stack * obs_shape[0], *obs_shape[1:]) s = envs.reset() stacked_s = update_stacked_s(stacked_s, s, obs_shape) #start the training for iter in range(self.iters): #go through some timesteps for step in range(self.T): #get the predicted action and how sure the network is of taking that action #get the predicted value of our current state too with torch.no_grad(): a, log_p, v = self.policy(stacked_s) #transform the action so it's only 1 dimension a_np = a.squeeze(1).cpu().numpy() #step through the environment and observe what happens s2, r, done, _ = envs.step(a_np) #reshape the rewards so they're all in separate rows #each actor has its own row r = torch.from_numpy(r).view(-1, 1).float() episode_rewards += r #set a mask for this state #we'll use this calculate returns and update the stack #if we're done, the mask is 0 -> this'll make returns stop cumulating at this point and it'll clear past actions from the stack so those past actions don't confuse the network #we should apply the mask to the stack after we've stored it (so we don't mess up the data we're currently using), so we don't do it just yet #I struggled with that last part for a bit, so imagine you're playing pong with frame stacking. Once the env resets, the last frames of the previous game don't affect you at all so they shouldnt be used to predict what comes next mask = torch.FloatTensor([[0.0] if d else [1.0] for d in done]) #store the data from this state #since stacked_s is declared at a higher scope, chaning its value in the training loop will change all the stored stacked_s values unless you store a copy of it instead rollouts.add(deepcopy(stacked_s), log_p, v, a, r, mask) #clears the stack if the env is done #there's no point in resetting the stack if there's only 1 value in it. 
the value will get reset in a few lines anyway so why do unnecessary math if self.num_stack > 1: stacked_s *= mask #keep track of those rewards final_rewards *= mask final_rewards += (1 - mask) * episode_rewards episode_rewards *= mask #update stacked_s s = s2 stacked_s = update_stacked_s(stacked_s, s, obs_shape) #predict one more value so we can calculate returns and advantages with torch.no_grad(): next_v = self.policy.get_value(stacked_s) rollouts.compute_adv_and_returns(next_v, self.gamma, self.tau, self.eps) #optimization epochs for epoch in range(self.epochs): #get the minibatches data = rollouts.get_mb(self.num_mb, self.N, self.T) #loop through the minibatches for sample in data: s_mb, log_p_old_mb, a_mb, returns_mb, adv_mb = sample log_p_mb, v_mb, entropy = self.policy.eval_a(s_mb, a_mb) #calculate the surrogate function #https://arxiv.org/pdf/1707.06347.pdf ratio = torch.exp(log_p_mb - log_p_old_mb) f1 = ratio * adv_mb f2 = torch.clamp(ratio, 1 - self.clip, 1 + self.clip) * adv_mb #calculate the loss #policy loss is based on the surrogate policy_loss = -torch.min(f1, f2).mean() #value loss is mean squared error of the returns and the predicted values value_loss = torch.pow(returns_mb - v_mb, 2).mean() * self.value_loss_coef #entropy loss isn't really loss -> it subtracts from the loss to promote exploration entropy_loss = (entropy * self.entropy_coef) loss = policy_loss + value_loss - entropy_loss #backprop and update weights optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(self.policy.parameters(), self.max_grad_norm) optimizer.step() #clear storage rollouts.reset() #update plots total_num_steps = (iter + 1) * self.N * self.T if iter % self.vis_iter == self.vis_iter - 1: xs.append(total_num_steps) graph_rewards = final_rewards.view(1, -1) mean_r = graph_rewards.mean().item() median_r = graph_rewards.median().item() min_r = torch.min(graph_rewards).item() max_r = torch.max(graph_rewards).item() std_r = graph_rewards.std().item() medians.append(median_r) first_quartiles.append(np.percentile(graph_rewards.numpy(), 25)) third_quartiles.append(np.percentile(graph_rewards.numpy(), 75)) mins.append(min_r) maxes.append(max_r) means.append(mean_r) stds.append(std_r) losses.append(loss.item()) self.visualizer.update_viz_median(xs, medians, first_quartiles, third_quartiles, mins, maxes, self.graph_colors, self.env_name, self.win_name) self.visualizer.update_viz_mean(xs, means, stds, self.graph_colors[1:], self.env_name, self.win_name) self.visualizer.update_viz_loss(xs, losses, self.graph_colors[2], self.env_name, self.win_name) #log the current data if iter % self.log_iter == self.log_iter - 1: print("iter: %d, steps: %d -> mean: %.1f, median: %.1f / min: %.1f, max: %.1f / policy loss: %.3f, value loss: %.1f, entropy loss: %.3f" % (iter + 1, total_num_steps, mean_r, median_r, min_r, max_r, policy_loss, value_loss, entropy_loss)) #save current weights if iter % self.save_iter == self.save_iter - 1: torch.save(self.policy.state_dict(), self.filename) print("params saved") #save current weights when we're all done torch.save(self.policy.state_dict(), self.filename) print("params saved")
def main(): print("#######") print("WARNING: All rewards are clipped so you need to use a monitor (see envs.py) or visdom plot to get true rewards") print("#######") os.environ['OMP_NUM_THREADS'] = '1' print (args.cuda) print (args.num_steps) print (args.num_processes) print (args.lr) print (args.eps) print (args.alpha) print (args.use_gae) print (args.gamma) print (args.tau) print (args.value_loss_coef) print (args.entropy_coef) # fsdaf # Create environment envs = SubprocVecEnv([ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ]) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) if len(envs.observation_space.shape) == 3: actor_critic = CNNPolicy(obs_shape[0], envs.action_space) else: actor_critic = MLPPolicy(obs_shape[0], envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] # action_shape = action_shape # shape_dim0 = envs.observation_space.shape[0] # if args.cuda: # dtype = torch.cuda.FloatTensor # else: # dtype = torch.FloatTensor hparams = {'cuda':args.cuda, 'num_steps':args.num_steps, 'num_processes':args.num_processes, 'obs_shape':obs_shape, 'lr':args.lr, 'eps':args.eps, 'alpha':args.alpha, 'use_gae':args.use_gae, 'gamma':args.gamma, 'tau':args.tau, 'value_loss_coef':args.value_loss_coef, 'entropy_coef':args.entropy_coef} # Create agent # agent = a2c(envs, hparams) # rollouts = RolloutStorage(self.num_steps, self.num_processes, self.obs_shape, envs.action_space) #it has a self.state that is [steps, processes, obs] #steps is used to compute expected reward if args.cuda: actor_critic.cuda() # rollouts.cuda() optimizer = optim.RMSprop(actor_critic.parameters(), hparams['lr'], eps=hparams['eps'], alpha=hparams['alpha']) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space) # Init state current_state = torch.zeros(args.num_processes, *obs_shape)#.type(dtype) def update_current_state(state):#, shape_dim0): shape_dim0 = envs.observation_space.shape[0] state = torch.from_numpy(state).float() if args.num_stack > 1: current_state[:, :-shape_dim0] = current_state[:, shape_dim0:] current_state[:, -shape_dim0:] = state # return current_state state = envs.reset() update_current_state(state)#, shape_dim0) # agent.insert_first_state(current_state) rollouts.states[0].copy_(current_state) #set the first state to current state # These are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_state = current_state.cuda()#type(dtype) # if args.cuda: rollouts.cuda() #Begin training start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Act # action, value = agent.act(Variable(agent.rollouts.states[step], volatile=True)) value, action = actor_critic.act(Variable(rollouts.states[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Observe reward and next state state, reward, done, info = envs.step(cpu_actions) # state:[nProcesss, ndims, height, width] # Record rewards # reward, masks, final_rewards, episode_rewards, current_state = update_rewards(reward, done, final_rewards, episode_rewards, current_state) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. # these final rewards are only used for printing. 
but the mask is used in the storage, dont know why yet # oh its just clearing the env that finished, and resetting its episode_reward masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #if an env is done final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_state.dim() == 4: current_state *= masks.unsqueeze(2).unsqueeze(2) else: current_state *= masks # return reward, masks, final_rewards, episode_rewards, current_state # Update state update_current_state(state)#, shape_dim0) # Agent record step # agent.insert_data(step, current_state, action.data, value.data, reward, masks) rollouts.insert(step, current_state, action.data, value.data, reward, masks) #Optimize agent # agent.update() next_value = actor_critic(Variable(rollouts.states[-1], volatile=True))[0].data # use last state to make prediction of next value if hasattr(actor_critic, 'obs_filter'): actor_critic.obs_filter.update(rollouts.states[:-1].view(-1, *obs_shape)) #not sure what this is rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) # this computes R = r + r+ ...+ V(t) for each step values, action_log_probs, dist_entropy = actor_critic.evaluate_actions( Variable(rollouts.states[:-1].view(-1, *obs_shape)), Variable(rollouts.actions.view(-1, action_shape))) # I think this aciton log prob could have been computed and stored earlier # and didnt we already store the value prediction??? values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() optimizer.step() rollouts.states[0].copy_(rollouts.states[-1]) # the first state is now the last state of the previous # #Save model # if j % args.save_interval == 0 and args.save_dir != "": # save_path = os.path.join(args.save_dir, args.algo) # try: # os.makedirs(save_path) # except OSError: # pass # # A really ugly way to save a model to CPU # save_model = actor_critic # if args.cuda: # save_model = copy.deepcopy(actor_critic).cpu() # torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) #Print updates if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps # print("Updates {}, n_timesteps {}, FPS {}, mean/median R {:.1f}/{:.1f}, min/max R {:.1f}/{:.1f}, T:{:.4f}".#, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}". # format(j, total_num_steps, # int(total_num_steps / (end - start)), # final_rewards.mean(), # final_rewards.median(), # final_rewards.min(), # final_rewards.max(), # end - start))#, -dist_entropy.data[0], # # value_loss.data[0], action_loss.data[0])) # print("Upts {}, n_timesteps {}, min/med/mean/max {:.1f}/{:.1f}/{:.1f}/{:.1f}, FPS {}, T:{:.1f}". # format(j, total_num_steps, # final_rewards.min(), # final_rewards.median(), # final_rewards.mean(), # final_rewards.max(), # int(total_num_steps / (end - start)), # end - start)) if j % (args.log_interval*30) == 0: print("Upts, n_timesteps, min/med/mean/max, FPS, Time") print("{}, {}, {:.1f}/{:.1f}/{:.1f}/{:.1f}, {}, {:.1f}". 
format(j, total_num_steps, final_rewards.min(), final_rewards.median(), final_rewards.mean(), final_rewards.max(), int(total_num_steps / (end - start)), end - start))
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") torch.set_num_threads(1) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir, args.add_timestep) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) actor_critic = Policy(obs_shape, envs.action_space, args.recurrent_policy) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() lmdb_idx = 0 try: os.makedirs(os.path.join(args.lmdb_path, args.env_name)) os.makedirs(os.path.join(args.lmdb_path, args.env_name, 'test')) except: print('Directory already exists.') for j in range(num_updates): for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, states = actor_critic.act( rollouts.observations[step], rollouts.states[step], rollouts.masks[step]) cpu_actions = action.squeeze(1).cpu().numpy() # Observe reward and next obs # obs, reward, done, info = envs.step(cpu_actions) '''unwrapped obs, reward''' obs, reward, done, info, wr_obs, wr_reward = envs.step(cpu_actions) # sample images # img = np.squeeze(np.transpose(obs[3], (1, 2, 0)), 2) for img, rwd in zip(wr_obs, wr_reward): if rwd > 0: lmdb_idx += 1 convert_to_lmdb( img, rwd, os.path.join(args.lmdb_path, args.env_name), lmdb_idx) # Evaluate unwrapped rewards # model = Model() # model.load(args.digit_checkpoint) # model.cuda() # accuracy = digit_eval(image, length_labels, digits_labels, model) # img.show() reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(current_obs, states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.observations[-1], rollouts.states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy, value_loss, action_loss)) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_frames) except IOError: pass
def load_new_screen(self): self.screen = SubprocVecEnv([make_env(*self.screen_name)])
class AtariRAMEnvironment(RawEnvironment): ''' generates the necessary components from the atari environment, including the object dictionary and other components ''' def __init__(self, env_id, seed, rank, log_dir): try: os.makedirs(log_dir) except OSError: pass self.screen_name = (env_id, seed, rank, log_dir) self.screen = SubprocVecEnv([make_env(env_id, seed, rank, log_dir)]) self.num_actions = self.screen.action_space.n self.itr = 0 self.save_path = "" self.factor_state = None self.reward = 0 self.current_raw = np.squeeze(self.screen.reset()) self.current_action = 0 # self.focus_model.cuda() def load_new_screen(self): self.screen = SubprocVecEnv([make_env(*self.screen_name)]) def set_save(self, itr, save_dir, recycle): self.save_path = save_dir self.itr = itr self.recycle = recycle try: os.makedirs(save_dir) except OSError: pass def step(self, action): # TODO: action is tensor, might not be safe assumption # t = time.time() uaction = pytorch_model.unwrap(action.long()) raw_state, reward, done, info = self.screen.step([uaction]) # a = time.time() # print("screen step", a - t) raw_state = np.squeeze(raw_state) # raw_state[:10,:] = 0.0 self.current_raw = raw_state raw_factor_state = {'Action': [[0.0, 0.0], (float(uaction), )]} self.current_action = action self.reward = reward[0] self.factor_state = raw_factor_state self.last_action = uaction # logging if len(self.save_path) > 0: if self.recycle > 0: state_path = os.path.join( self.save_path, str((self.itr % self.recycle) // 2000)) count = self.itr % self.recycle else: state_path = os.path.join(self.save_path, str(self.itr // 2000)) count = self.itr try: os.makedirs(state_path) except OSError: pass if self.itr != 0: object_dumps = open( os.path.join(self.save_path, "focus_dumps.txt"), 'a') else: object_dumps = open( os.path.join(self.save_path, "focus_dumps.txt"), 'w') # create file if it does not exist for key in factor_state.keys(): writeable = list(factor_state[key][0]) + list( factor_state[key][1]) object_dumps.write( key + ":" + " ".join([str(fs) for fs in writeable]) + "\t") # TODO: attributes are limited to single floats object_dumps.write( "\n") # TODO: recycling does not stop object dumping # imio.imsave(os.path.join(state_path, "state" + str(count % 2000) + ".png"), self.current_raw) self.itr += 1 # print("elapsed ", time.time() - t) return raw_state, self.factor_state, done def getState(self): raw_state = self.current_raw raw_factor_state = {'Action': self.current_action} if self.factor_state is None: factor_state = dict() factor_state['Action'] = raw_factor_state['Action'] self.factor_state = factor_state factor_state = self.factor_state return raw_state, factor_state
def main(): print("#######") print( "WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards" ) print("#######") os.environ['OMP_NUM_THREADS'] = '1' if args.vis: from visdom import Visdom viz = Visdom() win = None envs = [ make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes) ] if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: envs = VecNormalize(envs) obs_shape = envs.observation_space.shape obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) obs_numel = reduce(operator.mul, obs_shape, 1) if len(obs_shape) == 3 and obs_numel > 1024: actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy) else: assert not args.recurrent_policy, \ "Recurrent policy is not implemented for the MLP controller" actor_critic = MLPPolicy(obs_numel, envs.action_space) if envs.action_space.__class__.__name__ == "Discrete": action_shape = 1 else: action_shape = envs.action_space.shape[0] if args.cuda: actor_critic.cuda() if args.algo == 'a2c': optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha) elif args.algo == 'ppo': optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps) elif args.algo == 'acktr': optimizer = KFACOptimizer(actor_critic) rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space, actor_critic.state_size) current_obs = torch.zeros(args.num_processes, *obs_shape) def update_current_obs(obs): shape_dim0 = envs.observation_space.shape[0] obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:] current_obs[:, -shape_dim0:] = obs obs = envs.reset() update_current_obs(obs) rollouts.observations[0].copy_(current_obs) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: current_obs = current_obs.cuda() rollouts.cuda() start = time.time() for j in range(num_updates): for step in range(args.num_steps): # Sample actions value, action, action_log_prob, states = actor_critic.act( Variable(rollouts.observations[step], volatile=True), Variable(rollouts.states[step], volatile=True), Variable(rollouts.masks[step], volatile=True)) cpu_actions = action.data.squeeze(1).cpu().numpy() # Obser reward and next obs obs, reward, done, info = envs.step(cpu_actions) reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) final_rewards *= masks final_rewards += (1 - masks) * episode_rewards episode_rewards *= masks if args.cuda: masks = masks.cuda() if current_obs.dim() == 4: current_obs *= masks.unsqueeze(2).unsqueeze(2) else: current_obs *= masks update_current_obs(obs) rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks) next_value = actor_critic( Variable(rollouts.observations[-1], volatile=True), Variable(rollouts.states[-1], volatile=True), Variable(rollouts.masks[-1], volatile=True))[0].data rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) if args.algo in ['a2c', 'acktr']: values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(rollouts.observations[:-1].view(-1, *obs_shape)), Variable(rollouts.states[0].view(-1, actor_critic.state_size)), Variable(rollouts.masks[:-1].view(-1, 1)), Variable(rollouts.actions.view(-1, action_shape))) values = values.view(args.num_steps, args.num_processes, 1) action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1) advantages = Variable(rollouts.returns[:-1]) - values value_loss = advantages.pow(2).mean() action_loss = -(Variable(advantages.data) * action_log_probs).mean() if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0: # Sampled fisher, see Martens 2014 actor_critic.zero_grad() pg_fisher_loss = -action_log_probs.mean() value_noise = Variable(torch.randn(values.size())) if args.cuda: value_noise = value_noise.cuda() sample_values = values + value_noise vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean() fisher_loss = pg_fisher_loss + vf_fisher_loss optimizer.acc_stats = True fisher_loss.backward(retain_graph=True) optimizer.acc_stats = False optimizer.zero_grad() (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward() if args.algo == 'a2c': nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() elif args.algo == 'ppo': advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1] advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5) for e in range(args.ppo_epoch): if args.recurrent_policy: data_generator = rollouts.recurrent_generator( advantages, args.num_mini_batch) else: data_generator = rollouts.feed_forward_generator( advantages, args.num_mini_batch) for sample in data_generator: observations_batch, states_batch, actions_batch, \ return_batch, masks_batch, old_action_log_probs_batch, \ adv_targ = sample # Reshape to do in a single forward pass for all steps values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions( Variable(observations_batch), Variable(states_batch), Variable(masks_batch), Variable(actions_batch)) adv_targ = Variable(adv_targ) ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch)) surr1 = ratio * adv_targ surr2 = torch.clamp(ratio, 1.0 - args.clip_param, 1.0 + args.clip_param) * adv_targ action_loss = -torch.min( surr1, surr2).mean() # PPO's pessimistic surrogate (L^CLIP) value_loss = (Variable(return_batch) - values).pow(2).mean() optimizer.zero_grad() (value_loss + action_loss - dist_entropy * args.entropy_coef).backward() nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm) optimizer.step() rollouts.after_update() if j % args.save_interval == 0 and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) 
except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0: end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps print( "Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), final_rewards.mean(), final_rewards.median(), final_rewards.min(), final_rewards.max(), dist_entropy.data[0], value_loss.data[0], action_loss.data[0])) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo) except IOError: pass