def test_build_runner():
    """ Instantiate a runner """
    env = gym.make('FrozenLake-v0')
    policy = build_policy(env, 'mlp')
    runner = Runner(env=env, model=policy, nsteps=100,
                    gamma=1.0, lamb=1.0)
    return env, runner
def test_run_episode_Discrete():
    """ Take steps in a Discrete environment """
    env = gym.make('FrozenLake-v0')
    policy = build_policy(env, 'mlp')
    runner = Runner(env=env, model=policy)
    env.reset()
    runner.run()
def test_run_episode_Box():
    """ Take steps in a Box environment """
    env = gym.make('CartPole-v0')
    policy = build_policy(env, 'mlp')
    runner = Runner(env=env, model=policy)
    env.reset()
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, \
        mb_neglogps = runner.run()
    print(mb_obs)
def test_initial_obs():
    """ Check that a runner's obs property can accept a new observation
    in both Discrete and Box environments """
    # Discrete env
    env = gym.make('FrozenLake-v0')
    policy = build_policy(env, 'mlp')
    runner = Runner(env=env, model=policy)
    env.reset()
    obs, _, _, _ = env.step(env.action_space.sample())
    runner.obs = torch.tensor(obs)
    assert (runner.obs.numpy() == obs).all()

    # Box env
    env = gym.make('CartPole-v0')
    policy = build_policy(env, 'mlp')
    runner = Runner(env=env, model=policy)
    env.reset()
    obs, _, _, _ = env.step(env.action_space.sample())
    runner.obs = torch.tensor(obs)
    assert (runner.obs.numpy() == obs).all()
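
# Usage note (an assumption, not stated in the original tests): these functions follow
# pytest conventions, so with the usual imports at the top of the file they can be run
# with `pytest -q`. A standalone smoke run could also look like this:
if __name__ == '__main__':
    test_build_runner()
    test_run_episode_Discrete()
    test_run_episode_Box()
    test_initial_obs()
    print('all runner tests passed')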
            self.obs, reward, self.dones = self._make_tensor(obs, reward, dones)
            mb_obs.append(self.obs.detach().clone())
            mb_rewards.append(reward)
            mb_dones.append(self.dones)
            if dones:
                break

        return map(tensor_stack, (mb_obs, mb_rewards, mb_actions, mb_values,
                                  mb_dones, mb_neglogps))


def tensor_stack(lst):
    """ Stack a list of tensors into a single tensor along dim=0 (timestep). """
    return torch.stack(lst, dim=0)


if __name__ == '__main__':
    import gym
    from baselines.common.policy_util import build_policy

    env = gym.make('CartPole-v0')
    policy = build_policy(env, 'mlp')
    runner = Runner(env=env, model=policy)
    env.reset()
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, \
        mb_neglogps = runner.run()
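    # Illustrative check (not part of the original script): every value returned by
    # run() is stacked along dim=0 by tensor_stack, so its first dimension is the
    # number of timesteps collected in the episode.
    print('obs:', mb_obs.shape, 'rewards:', mb_rewards.shape, 'actions:', mb_actions.shape)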
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048,
          ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, minibatches=4, noptepochs=4, cliprange=0.2, save_interval=0,
          load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1,
          **network_kwargs):
    """
    Main interface function to learn a policy with the PPO algorithm.

    Args:
        network: (str) policy network architecture; e.g., mlp, lstm, cnn, cnn_lstm, etc.
        env: (Env class object) environment
        seed: (int) random seed for reproducibility
        nsteps: (int) number of timesteps of the vectorized environments per training iteration,
                i.e., batch_size (in # of steps) = nsteps * num_envs
        minibatches: (int) number of mini-batches per training iteration
        total_timesteps: (int) total number of timesteps (i.e., actions) taken in the
                environment over the whole training session
        noptepochs: (int) number of optimization epochs per training iteration
                (each epoch makes one update per minibatch)
        log_interval: (int) number of training iterations between logging events
        **network_kwargs: keyword arguments forwarded to the policy / value network builder

    Notes:
        1) batch_size, epochs, iterations:
           -- each iteration gathers a fresh batch_size of training data by running the
              environment & collecting statistics
              - each batch_size of training data is used for multiple updates per iteration
           -- example: nsteps = 2048, num_envs = 1 (no parallel copies)
              -- batch_size = 2048, i.e. 2048 environment steps per training iteration
              -- let minibatches = 4, so batch_size_train = 2048 / 4 = 512 timesteps
                 per training update
              -- let noptepochs = 4
              -- hence one training iteration:
                 - has 4 epochs
                 - each epoch has 4 minibatches
                 - each minibatch has 512 timesteps
                 - makes one update per minibatch
                 - total updates per iteration is thus 4*4 = 16, with 2048 timesteps
                   of training data in total
    """
    ### ----------------------------------------------------------------------------------------------------- ###
    ### step 1: initialize training settings
    ### ----------------------------------------------------------------------------------------------------- ###

    # set global random seed (for distributed settings)
    set_global_seed(seed)

    # check if this process is the MPI root (or MPI is not used at all)
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # total number of timesteps in a training session (hyperparameter)
    total_timesteps = int(total_timesteps)

    # number of environment copies simulated in parallel
    num_envs = env.num_envs

    # calculate batch sizes
    # a) batch_size
    #    -- measured in number of timesteps
    #    -- number of timesteps gathered for one training iteration
    #       (a.k.a. timesteps per iteration in some papers)
    #    -- = number of parallel environment copies x number of steps per copy per iteration
    batch_size = num_envs * nsteps

    # b) batch_size_train
    #    -- measured in number of timesteps
    #    -- number of timesteps used for one training update
    #       (multiple updates per iteration, controlled by noptepochs and minibatches)
    #    -- = batch_size // minibatches
    batch_size_train = batch_size // minibatches

    # make sure batch_size is divisible by minibatches
    assert batch_size % minibatches == 0, "batch_size is not divisible by minibatches"

    # c) num_iterations
    #    -- total number of training iterations in the session
    #    -- = total_timesteps // batch_size
    num_iterations = total_timesteps // batch_size

    # get state and action spaces
    ob_space = env.observation_space
    ac_space = env.action_space
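
    # Worked example (illustrative, matching the Notes in the docstring): with num_envs = 1
    # and nsteps = 2048, batch_size = 2048; with minibatches = 4, batch_size_train = 512;
    # with noptepochs = 4, each iteration performs noptepochs * minibatches = 16 gradient
    # updates on those 2048 timesteps.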
    ### ----------------------------------------------------------------------------------------------------- ###
    ### step 2: instantiate PPO objects
    ### ----------------------------------------------------------------------------------------------------- ###

    # 1) build the policy + value networks by instantiating the policy object
    #    -- the policy class lives in common & is shared by different agents,
    #       with agent-specific customizations
    policy = build_policy(env, network, **network_kwargs)

    # 2) build the agent algorithm by instantiating the model object
    if model_fn is None:
        model_fn = Model
    model = model_fn()

    # load a pretrained model if specified
    if load_path is not None:
        model.load(load_path)

    # 3) build the episode-running framework by instantiating the runner object
    runner = Runner()
    if eval_env is not None:
        eval_runner = Runner()

    ### ----------------------------------------------------------------------------------------------------- ###
    ### step 3: main training loop
    ### ----------------------------------------------------------------------------------------------------- ###

    # start total timer
    t_first_start = perf_counter()

    # loop over training iterations
    for iteration in range(1, num_iterations + 1):

        # start iteration timer
        tstart = perf_counter()

        # schedule learning rate
        # schedule clip range

        # logging events
        if iteration % log_interval == 0 and is_mpi_root:
            logger.info("Stepping environment...")

        # 1. Get a batch of rollout statistics via runner.run()
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()

        # -- evaluation mode
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, \
                eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()

        # logging events
        if iteration % log_interval == 0 and is_mpi_root:
            logger.info("Done.")

        # accumulate episode info dict

        # 2. Calculate the loss for each minibatch and accumulate
        mb_lossvals = []

        # A) non-recurrent policies (states is None when the policy has no RNN state)
        if states is None:
            # create an index array over every timestep in the batch
            indices = np.arange(batch_size)
            # loop over epochs
            for _ in range(noptepochs):
                # randomize the indices
                np.random.shuffle(indices)
                # loop over the training data from 0 to batch_size in steps of batch_size_train
                for start in range(0, batch_size, batch_size_train):
                    # calculate end index
                    end = start + batch_size_train
                    # index slice for one minibatch of size batch_size_train
                    mb_indices = indices[start:end]
                    # slice the training data for this minibatch
                    slices = (arr[mb_indices] for arr in
                              (obs, returns, masks, actions, values, neglogpacs))
                    # accumulate loss
                    mb_lossvals.append(model.train(*slices))

        # B) recurrent policies (states carries the RNN hidden state per environment copy)
        else:
            # NOTE: this assert requires num_envs >= minibatches; it won't hold if num_envs = 1
            assert num_envs % minibatches == 0, "num_envs is not divisible by minibatches"
            envs_per_batch = num_envs // minibatches
            # create an index array over the parallel environment copies
            env_indices = np.arange(num_envs)
            # create a flattened index matrix for all timesteps in all environment copies
            flat_indices = np.arange(num_envs * nsteps).reshape(num_envs, nsteps)
            # loop over epochs
            for _ in range(noptepochs):
                # randomize environment indices
                np.random.shuffle(env_indices)
                # loop over the environments from 0 to num_envs in steps of envs_per_batch
                for start in range(0, num_envs, envs_per_batch):
                    # calculate end environment index
                    end = start + envs_per_batch
                    # index slice of environments of size envs_per_batch
                    mb_env_indices = env_indices[start:end]
                    # flattened timestep indices belonging to those environments
                    mb_flat_indices = flat_indices[mb_env_indices].ravel()
                    # slice training data
                    slices = (arr[mb_flat_indices] for arr in
                              (obs, returns, masks, actions, values, neglogpacs))
                    # slice states
                    mb_states = states[mb_env_indices]
                    # accumulate loss
                    mb_lossvals.append(model.train(*slices, mb_states))

        # 3. Summarize the updates
        # -- average training losses over the minibatches (the network updates themselves
        #    happen inside model.train)
        lossvals = np.mean(mb_lossvals, axis=0)

        # end iteration timer
        tnow = perf_counter()

        # calculate frames per second (fps), treating each timestep as one frame
        fps = int(batch_size / (tnow - tstart))

        if update_fn is not None:
            update_fn(iteration)

        # 4. Logistics
        # -- log events
        if iteration % log_interval == 0 or iteration == 1:
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", iteration * nsteps)
            logger.logkv("misc/num_iterations", iteration)
            logger.logkv("misc/total_timesteps", iteration * batch_size)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv("misc/time_elapsed", tnow - t_first_start)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv("loss/" + lossname, lossval)
            logger.dumpkvs()

        # -- save checkpoints
        if save_interval and (iteration % save_interval == 0 or iteration == 1) \
                and logger.get_dir() and is_mpi_root:
            checkdir = os.path.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = os.path.join(checkdir, '%.5i' % iteration)
            print("Saving to {}".format(savepath))
            model.save(savepath)

    return model
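

# Illustrative sketch (not part of learn itself): the shuffled minibatch slicing used in
# branch A of step 3 above, replayed on toy numpy arrays. All numbers are hypothetical:
# batch_size = 8, minibatches = 2, noptepochs = 2, so each update sees
# batch_size_train = 4 timesteps.
if __name__ == '__main__':
    import numpy as np

    batch_size, minibatches, noptepochs = 8, 2, 2
    batch_size_train = batch_size // minibatches

    # stand-ins for the rollout arrays (obs, returns, ...): one row per timestep
    obs = np.arange(batch_size * 3).reshape(batch_size, 3)
    returns = np.arange(batch_size, dtype=float)

    indices = np.arange(batch_size)
    for _ in range(noptepochs):
        np.random.shuffle(indices)
        for start in range(0, batch_size, batch_size_train):
            # each minibatch is a random, non-overlapping slice of the shuffled timesteps
            mb_indices = indices[start:start + batch_size_train]
            mb_obs, mb_returns = obs[mb_indices], returns[mb_indices]
            print(mb_indices, mb_obs.shape, mb_returns.shape)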