def demonstrate(network, env, nsteps, mvs, load_path,
                ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5,
                mpi_rank_weight=1, comm=None, gamma=0.99, lam=0.95):
    policy = build_policy(env, network)
    model = Model(policy=policy, nbatch_act=1, nbatch_train=None, nsteps=nsteps,
                  ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm,
                  comm=comm, mpi_rank_weight=mpi_rank_weight)
    if load_path is not None:
        model.load(load_path)
        print('Model has been successfully loaded from {0}'.format(load_path))
    else:
        print('No model has been loaded. Neural network with random weights is used.')

    # Instantiate the runner object and episode buffer
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, mvs=mvs)
    obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(render=True)

    print('Demo completed! Reward: {0}'.format(epinfos[0]['r']))
    print('\nPress Ctrl+C to stop the demo...')
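# --- Hedged usage sketch (added for illustration; not part of the original source) ---
# Shows one plausible way to call `demonstrate` above with a single vectorized
# environment and a saved checkpoint. The environment id, the checkpoint path, and
# passing `mvs=None` are assumptions made purely for this example.
def _example_demonstrate():
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    env = DummyVecEnv([lambda: gym.make('CartPole-v1')])  # one env is enough for a demo
    demonstrate(network='mlp', env=env, nsteps=128, mvs=None,
                load_path='/path/to/checkpoint')  # hypothetical path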
def train_value(self, env, env_type, nupdates, minibatch_size=64):
    from baselines.ppo2.runner import Runner
    import baselines.ppo2.defaults as defaults

    if env_type == 'mujoco':
        params = defaults.mujoco()
    elif env_type == 'atari':
        params = defaults.atari()
    else:
        assert False

    runner = Runner(env=env, model=self.model, nsteps=params['nsteps'],
                    gamma=params['gamma'], lam=params['lam'])

    for update in tqdm(range(1, nupdates + 1), dynamic_ncols=True):
        frac = 1.0 - (update - 1.0) / nupdates
        cliprangenow = params['cliprange'](frac)

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  # pylint: disable=E0632

        length = len(obs)
        losses = []
        for _ in range(params['noptepochs']):
            inds = np.random.permutation(length)
            for s in range(0, length, minibatch_size):
                mbinds = inds[s:s + minibatch_size]
                with self.graph.as_default():
                    loss, _ = self.sess.run(
                        [self.vf_loss, self.value_update_op],
                        feed_dict={
                            self.inp: obs[mbinds],
                            self.R: returns[mbinds],
                            self.OLDVPRED: values[mbinds],
                            self.CLIPRANGE: cliprangenow
                        })
                    losses.append(loss)
        tqdm.write(('loss: %f') % (np.mean(losses)))
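# --- Hedged usage sketch (added for illustration; not part of the original source) ---
# `train_value` above refits only the value head with PPO's clipped value loss,
# reusing the default hyperparameters of the chosen benchmark suite. The `model` and
# `env` objects and the update count below are assumptions for the example.
def _example_train_value(model, env):
    # 50 updates of value-only training with the MuJoCo defaults
    model.train_value(env, env_type='mujoco', nupdates=50, minibatch_size=64)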
class Model(object): def __init__(self, *, network, env, lr=3e-4, cliprange=0.2, nsteps=128, nminibatches=4, noptepochs=4, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, mpi_rank_weight=1, comm=None, microbatch_size=None, load_path=None, **network_kwargs): """ Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies.py env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) nminibatches: int number of training minibatches per update. For recurrent policies.py, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update ent_coef: float policy entropy coefficient in the optimization objective vf_coef: float value function loss coefficient in the optimization objective gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
""" self.sess = sess = get_session() if MPI is not None and comm is None: comm = MPI.COMM_WORLD policy = build_policy(env, network, **network_kwargs) self.env = env if isinstance(lr, float): self.lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): self.cliprange = constfn(cliprange) else: assert callable(cliprange) self.nminibatches = nminibatches # if eval_env is not None: # eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) # Calculate the batch_size self.nenvs = self.env.num_envs self.nsteps = nsteps self.nbatch = self.nenvs * self.nsteps self.nbatch_train = self.nbatch // nminibatches self.noptepochs = noptepochs with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(self.nenvs, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(self.nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder( [None]) # action placeholder self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # ratio 裁剪量 # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if comm is not None and comm.Get_size() > 1: self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.def_path_pre = os.path.dirname( os.path.abspath(__file__)) + '/tmp/' initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables, comm=comm) # pylint: disable=E1101 if load_path is not None: self.load_newest(load_path) # Instantiate the runner object self.runner = Runner(env=self.env, model=self, nsteps=nsteps, gamma=gamma, lam=lam) def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # Returns = R + yV(s') advs = returns - values # Normalize the advantages advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = { self.train_model.X: obs, self.A: actions, self.ADV: advs, self.R: returns, self.LR: lr, self.CLIPRANGE: cliprange, self.OLDNEGLOGPAC: neglogpacs, self.OLDVPRED: values } if states is not None: td_map[self.train_model.S] = states td_map[self.train_model.M] = masks return self.sess.run(self.stats_list + [self._train_op], td_map)[:-1] def learn(self, total_timesteps, seed=None, log_interval=10, save_interval=10): set_global_seeds(seed) total_timesteps = int(total_timesteps) # Calculate the batch_size is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) epinfobuf = deque(maxlen=100) # if eval_env is not None: # eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() for update in range(1, total_timesteps): assert self.nbatch % self.nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / total_timesteps # Calculate the learning rate lrnow = self.lr(frac) # Calculate the cliprange cliprangenow = self.cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = self.runner.run( ) # pylint: disable=E0632 # if eval_env is not None: # eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() # pylint: disable=E0632 if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) # if eval_env is not None: # eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(self.nbatch) for _ in range(self.noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, self.nbatch, self.nbatch_train): end = start + self.nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append( self.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert self.nenvs % self.nminibatches == 0 envsperbatch = self.nenvs // self.nminibatches envinds = np.arange(self.nenvs) flatinds = np.arange(self.nenvs * self.nsteps).reshape( self.nenvs, self.nsteps) for _ in range(self.noptepochs): np.random.shuffle(envinds) for start in range(0, self.nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( self.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(self.nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.record_tabular("misc/serial_timesteps", update * self.nsteps) logger.record_tabular("misc/nupdates", update) logger.record_tabular("misc/total_timesteps", update * self.nbatch) logger.record_tabular("fps", fps) logger.record_tabular("misc/explained_variance", float(ev)) logger.record_tabular( 'eprewmean', safe_mean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular( 'eplenmean', safe_mean([epinfo['l'] for epinfo in epinfobuf])) # if eval_env is not None: # logger.record_tabular('eval_eprewmean', safe_mean([epinfo['r'] for epinfo in eval_epinfobuf])) # logger.record_tabular('eval_eplenmean', safe_mean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.record_tabular('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, self.loss_names): logger.record_tabular('loss/' + lossname, lossval) if is_mpi_root: logger.dump_tabular() if save_interval and (update % save_interval == 0 or update == 1) and is_mpi_root: file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S', time.localtime(time.time())) model_save_path = self.def_path_pre + file_name self.save(model_save_path) return self def save(self, save_path=None): save_variables(save_path=save_path, sess=self.sess) print('save model variables to', save_path) def load_newest(self, load_path=None): file_list = os.listdir(self.def_path_pre) file_list.sort( key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x))) if load_path is None: load_path = os.path.join(self.def_path_pre, file_list[-1]) load_variables(load_path=load_path, sess=self.sess) print('load_path: ', load_path) def load_index(self, index, load_path=None): file_list = os.listdir(self.def_path_pre) file_list.sort( key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)), reverse=True) if load_path is None: load_path = os.path.join(self.def_path_pre, file_list[index]) load_variables(load_path=load_path, sess=self.sess) print('load_path: ', load_path)
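# --- Hedged usage sketch (added for illustration; not part of the original source) ---
# One plausible way to drive the Model class above end-to-end: build a vectorized
# environment, construct the model with a linearly decaying learning rate (the
# docstring's "schedule function [0,1] -> R+"), and call `learn`. The environment id
# and the hyperparameter values are assumptions made for this example.
def _example_model_learn():
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    env = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(4)])
    model = Model(network='mlp', env=env,
                  lr=lambda frac: 3e-4 * frac,   # frac goes from 1 to 0 over training
                  cliprange=0.2, nsteps=128, nminibatches=4, noptepochs=4)
    model.learn(total_timesteps=100000, seed=0, log_interval=10, save_interval=50)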
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) if init_fn is not None: init_fn() # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('loss/' + lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) return model
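# --- Hedged usage sketch (added for illustration; not part of the original source) ---
# The docstring above describes `lr` and `cliprange` as either constants or schedule
# functions of the remaining-progress fraction (1 at the start of training, 0 at the
# end). A minimal call using both forms; the environment construction and the
# hyperparameter values are assumptions for this example.
def _example_learn_with_schedules():
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    env = DummyVecEnv([lambda: gym.make('CartPole-v1') for _ in range(8)])
    model = learn(network='mlp', env=env, total_timesteps=1_000_000,
                  nsteps=128, nminibatches=4, noptepochs=4,
                  lr=lambda frac: 2.5e-4 * frac,   # linear decay to zero
                  cliprange=0.1,                   # constant clipping range
                  log_interval=10, save_interval=100)
    return model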
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) total_timesteps = int(total_timesteps) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space if isinstance(network, str): network_type = network policy_network_fn = get_network_builder(network_type)(**network_kwargs) policy_network = policy_network_fn(ob_space.shape) # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(ac_space=ac_space, policy_network=policy_network, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr) if load_path is not None: load_path = osp.expanduser(load_path) ckpt = tf.train.Checkpoint(model=model) manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None) ckpt.restore(manager.latest_checkpoint) print("Restoring from {}".format(manager.latest_checkpoint)) print('after restore, all trainable weights {}'.format( model.train_model.policy_network.trainable_weights)) #model.load_weights(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate #lrnow = lr(frac) lrnow = lr # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (tf.constant(arr[mbinds]) for arr in (obs, returns, masks, actions, values, neglogpacs)) # slice_obs, slice_returns, slice_masks, slice_actions, slice_values, slice_neglogpacs = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) # slice_advs = slice_returns - slice_values # slice_advs = (slice_advs - slice_advs.mean()) / (slice_advs.std() + 1e-8) # slices = (tf.constant(slice_obs), tf.constant(slice_returns), tf.constant(slice_masks), # tf.constant(slice_advs), tf.constant(slice_actions), tf.constant(slice_values), tf.constant(slice_neglogpacs)) # print('slice actions {}'.format(slice_actions.dtype)) # print('-------------------------------------------') # print('inds {}'.format(inds)) # print('slice obs {}'.format(slice_obs)) # print('slice returns {}'.format(slice_returns)) # print('slice masks {}'.format(slice_masks)) # print('slice actions {}'.format(slice_actions)) # print('slice values {}'.format(slice_values)) # print('slice neglogpacs {}'.format(slice_neglogpacs)) # print('slice advs {}'.format(slice_advs)) pg_loss, vf_loss, entropy, approxkl, clipfrac, vpred, vpredclipped = model.train( lrnow, cliprange, *slices) # pg_loss, vf_loss, entropy, approxkl, clipfrac, vpred, vpredclipped = model.train( # cliprange, obs=slice_obs, returns=slice_returns, masks=slice_masks, advs=slice_advs, # actions=slice_actions, values=slice_values, neglogpac_old=slice_neglogpacs) # print('pg_loss {}'.format(pg_loss)) # print('vf_loss {}'.format(vf_loss)) # print('entropy {}'.format(entropy)) # print('approxkl {}'.format(approxkl)) # print('clipfrac {}'.format(clipfrac)) # print('vpred {}'.format(vpred)) # print('vpredclipped {}'.format(vpredclipped)) # print('pg_loss1 {}'.format(pg_loss1)) # print('pg_loss2 {}'.format(pg_loss2)) # train_model = model.train_model # params = train_model.policy_network.trainable_weights + train_model.value_fc.trainable_weights + train_model.pdtype.matching_fc.trainable_weights # for param in params: # print('param {} is {}'.format(param.name, param.numpy())) # print('-------------------------------------------') mblossvals.append([ pg_loss.numpy(), vf_loss.numpy(), entropy.numpy(), approxkl.numpy(), clipfrac.numpy() ]) # mblossvals.append([output for output.numpy() in model.train(cliprange, *slices)]) else: # recurrent version raise ValueError('Not Support Yet') # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv( 'eval_eprewmean', 
                    safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv(
                    'eval_eplenmean',
                    safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()

    return model
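# --- Hedged usage sketch (added for illustration; not part of the original source) ---
# In this TF2 variant, `load_path` is treated as a checkpoint directory managed by
# tf.train.CheckpointManager rather than a single weights file. The directory name and
# hyperparameter values below are assumptions for the example; `env` is a pre-built
# vectorized environment.
def _example_learn_tf2_resume(env):
    model = learn(network='mlp', env=env, total_timesteps=200000,
                  nsteps=128, nminibatches=4, noptepochs=4,
                  load_path='~/models/ppo2_checkpoints')  # hypothetical directory
    return model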
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=1, noptepochs=4, cliprange=0.2, save_interval=1000, load_path=None, model_fn=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' nsteps = env.args.nsteps set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Get the nb of env nenvs = env.num_envs # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches models = [] policy = [] for agent_i in range(env.spec): policy.append(build_policy(env, network, **network_kwargs)) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy[agent_i], ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, agent_index=agent_i) if load_path is not None: model.load(load_path + ('checkpoints-%i/' % agent_i) + env.args.s_load_num) print('successfully load agent-%d' % agent_i) # Instantiate the runner object models.append(model) # ### runner = Runner(env=env, model_n=models, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env=eval_env, model_n=models, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) all_rewards_buf_0 = deque(maxlen=100) all_rewards_buf_1 = deque(maxlen=100) d_rewards_buf_0 = deque(maxlen=100) ce_rewards_buf_0 = deque(maxlen=100) c_rewards_buf_show_0 = deque(maxlen=100) c_rewards_buf_t_0 = deque(maxlen=100) c_rewards_buf_r_0 = deque(maxlen=100) c_rewards_buf_v_0 = deque(maxlen=100) ext_rewards_buf_v_0 = deque(maxlen=100) int_rewards_buf_v_0 = deque(maxlen=100) c_rewards_buf_tv_0 = deque(maxlen=100) ext_rewards_buf_tv_0 = deque(maxlen=100) int_rewards_buf_tv_0 = deque(maxlen=100) c_rewards_buf_all_0 = deque(maxlen=100) p_rewards_buf = deque(maxlen=100) d_rewards_buf_1 = deque(maxlen=100) ce_rewards_buf_1 = deque(maxlen=100) c_rewards_buf_show_1 = deque(maxlen=100) c_rewards_buf_t_1 = deque(maxlen=100) c_rewards_buf_r_1 = deque(maxlen=100) c_rewards_buf_v_1 = deque(maxlen=100) ext_rewards_buf_v_1 = deque(maxlen=100) int_rewards_buf_v_1 = deque(maxlen=100) c_rewards_buf_tv_1 = deque(maxlen=100) ext_rewards_buf_tv_1 = deque(maxlen=100) int_rewards_buf_tv_1 = deque(maxlen=100) c_rewards_buf_all_1 = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch print(total_timesteps) print(nupdates) for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch obs_n, returns_n, masks_n, actions_n, values_n, neglogpacs_n, e_returns_n, e_values_n, e_neglogpacs_n, \ c_returns_n, c_values_n, c_neglogpacs_n, \ states_n, e_states_n, c_states_n, epinfos, all_rewards, d_rewards, c_rewards_show, c_rewards_t, c_rewards_r, \ c_rewards_v, c_rewards_tv, c_rewards_all, p_rewards, ce_rewards, \ ext_rewards_tv_n, int_rewards_tv_n, ext_rewards_v_n, int_rewards_v_n = runner.run() # pylint: disable=E0632 if eval_env is not None: eval_obs_n, eval_returns_n, eval_masks_n, eval_actions_n, eval_values_n, eval_neglogpacs_n, eval_states_n, \ eval_epinfos = eval_runner.run() # pylint: disable=E0632 num_env = 
p_rewards.shape[1] epinfobuf.append(1. * np.sum(epinfos) / (np.sum(masks_n[0]) + num_env)) all_rewards_buf_0.append(1. * np.sum(all_rewards[:, 0]) / (np.sum(masks_n[0]) + num_env)) d_rewards_buf_0.append(1. * np.sum(d_rewards[:, 0]) / (np.sum(masks_n[0]) + num_env)) ce_rewards_buf_0.append(1. * np.sum(ce_rewards[:, 0]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_show_0.append(1. * np.sum(c_rewards_show[:, 0]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_r_0.append(1. * np.sum(c_rewards_r[:, 0]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_v_0.append(1. * np.sum(c_rewards_v[:, 0]) / (np.sum(masks_n[0]) + num_env)) ext_rewards_buf_v_0.append(1. * np.sum(ext_rewards_v_n[:, 0]) / (np.sum(masks_n[0]) + num_env)) int_rewards_buf_v_0.append(1. * np.sum(int_rewards_v_n[:, 0]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_t_0.append(1. * np.sum(c_rewards_t[:, 0]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_tv_0.append(1. * np.sum(c_rewards_tv[:, 0]) / (np.sum(masks_n[0]) + num_env)) ext_rewards_buf_tv_0.append(1. * np.sum(ext_rewards_tv_n[:, 0]) / (np.sum(masks_n[0]) + num_env)) int_rewards_buf_tv_0.append(1. * np.sum(int_rewards_tv_n[:, 0]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_all_0.append(1. * np.sum(c_rewards_all[:, 0]) / (np.sum(masks_n[0]) + num_env)) all_rewards_buf_1.append(1. * np.sum(all_rewards[:, 1]) / (np.sum(masks_n[0]) + num_env)) d_rewards_buf_1.append(1. * np.sum(d_rewards[:, 1]) / (np.sum(masks_n[0]) + num_env)) ce_rewards_buf_1.append(1. * np.sum(ce_rewards[:, 1]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_show_1.append(1. * np.sum(c_rewards_show[:, 1]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_r_1.append(1. * np.sum(c_rewards_r[:, 1]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_v_1.append(1. * np.sum(c_rewards_v[:, 1]) / (np.sum(masks_n[0]) + num_env)) ext_rewards_buf_v_1.append(1. * np.sum(ext_rewards_v_n[:, 1]) / (np.sum(masks_n[0]) + num_env)) int_rewards_buf_v_1.append(1. * np.sum(int_rewards_v_n[:, 1]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_t_1.append(1. * np.sum(c_rewards_t[:, 1]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_tv_1.append(1. * np.sum(c_rewards_tv[:, 1]) / (np.sum(masks_n[0]) + num_env)) ext_rewards_buf_tv_1.append(1. * np.sum(ext_rewards_tv_n[:, 1]) / (np.sum(masks_n[0]) + num_env)) int_rewards_buf_tv_1.append(1. * np.sum(int_rewards_tv_n[:, 1]) / (np.sum(masks_n[0]) + num_env)) c_rewards_buf_all_1.append(1. * np.sum(c_rewards_all[:, 1]) / (np.sum(masks_n[0]) + num_env)) p_rewards_buf.append(-1. * np.sum(p_rewards) / (np.sum(masks_n[0]) + num_env)) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals_n = [[] for _ in range(env.spec)] mb_e_lossvals_n = [[] for _ in range(env.spec)] mb_c_lossvals_n = [[] for _ in range(env.spec)] if states_n[0] is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array for agent_i in range(env.spec): inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] # ########### TRAIN MODEL slices = (arr[mbinds] for arr in (obs_n[agent_i], returns_n[agent_i], masks_n[agent_i], actions_n[agent_i], values_n[agent_i], neglogpacs_n[agent_i])) mblossvals_n[agent_i].append(models[agent_i].train( lrnow, cliprangenow, *slices)) # ########## TRAIN E_MODEL e_slices = (arr[mbinds] for arr in (obs_n[agent_i], e_returns_n[agent_i], masks_n[agent_i], actions_n[agent_i], e_values_n[agent_i], e_neglogpacs_n[agent_i])) if env.args.s_alg_name == 'noisy' or env.args.s_alg_name == 'cen' or \ env.args.s_alg_name == 'dec': mb_e_lossvals_n[agent_i].append(0) else: mb_e_lossvals_n[agent_i].append( models[agent_i].e_train( lrnow, cliprangenow, *e_slices)) # ########## TRAIN C_MODEL c_slices = (arr[mbinds] for arr in (obs_n[agent_i], c_returns_n[agent_i], masks_n[agent_i], actions_n[agent_i], c_values_n[agent_i], c_neglogpacs_n[agent_i])) if env.args.s_alg_name == 'noisy' or env.args.s_alg_name == 'cen' or \ env.args.s_alg_name == 'dec': mb_c_lossvals_n[agent_i].append(0) else: mb_c_lossvals_n[agent_i].append( models[agent_i].c_train( lrnow, cliprangenow, *c_slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches for agent_i in range(env.spec): envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs_n[agent_i], returns_n[agent_i], masks_n[agent_i], actions_n[agent_i], values_n[agent_i], neglogpacs_n[agent_i])) mbstates = states_n[agent_i][mbenvinds] mblossvals_n[agent_i].append(models[agent_i].train( lrnow, cliprangenow, *slices, mbstates)) e_slices = (arr[mbflatinds] for arr in (obs_n[agent_i], e_returns_n[agent_i], masks_n[agent_i], actions_n[agent_i], e_values_n[agent_i], e_neglogpacs_n[agent_i])) e_mbstates = e_states_n[agent_i][mbenvinds] if env.args.s_alg_name == 'noisy' or env.args.s_alg_name == 'cen' or \ env.args.s_alg_name == 'dec': mb_e_lossvals_n[agent_i].append(0) else: mb_e_lossvals_n[agent_i].append( models[agent_i].e_train( lrnow, cliprangenow, *e_slices, e_mbstates)) c_slices = (arr[mbflatinds] for arr in (obs_n[agent_i], c_returns_n[agent_i], masks_n[agent_i], actions_n[agent_i], c_values_n[agent_i], c_neglogpacs_n[agent_i])) c_mbstates = c_states_n[agent_i][mbenvinds] if env.args.s_alg_name == 'noisy' or env.args.s_alg_name == 'cen' or \ env.args.s_alg_name == 'dec': mb_c_lossvals_n[agent_i].append(0) else: mb_c_lossvals_n[agent_i].append( models[agent_i].c_train( lrnow, cliprangenow, *c_slices, c_mbstates)) # Feedforward --> get losses --> update lossvals_n = [ np.mean(mblossvals_n[agent_i], axis=0) for agent_i in range(env.spec) ] e_lossvals_n = [ np.mean(mb_e_lossvals_n[agent_i], axis=0) for agent_i in range(env.spec) ] c_lossvals_n = [ np.mean(mb_c_lossvals_n[agent_i], axis=0) for agent_i in range(env.spec) ] # End timer 
tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv('time_elapsed', tnow - tfirststart) logger.logkv('eprewmean', safemean([epinfo for epinfo in epinfobuf])) logger.logkv( 'ep_all_rewmean_0', safemean([all_rewards for all_rewards in all_rewards_buf_0])) logger.logkv( 'ep_all_rewmean_1', safemean([all_rewards for all_rewards in all_rewards_buf_1])) logger.logkv( 'ep_dec_rewmean_0', safemean([all_rewards for all_rewards in d_rewards_buf_0])) logger.logkv( 'ep_cen_rewmean_0', safemean([all_rewards for all_rewards in ce_rewards_buf_0])) logger.logkv( 'ep_coor_rewmean_show_0', safemean([all_rewards for all_rewards in c_rewards_buf_show_0])) logger.logkv( 'ep_coor_rewmean_r_0', safemean([all_rewards for all_rewards in c_rewards_buf_r_0])) logger.logkv( 'ep_coor_rewmean_v_0', safemean([all_rewards for all_rewards in c_rewards_buf_v_0])) logger.logkv( 'ep_coor_rewmean_v_ext_0', safemean([all_rewards for all_rewards in ext_rewards_buf_v_0])) logger.logkv( 'ep_coor_rewmean_v_int_0', safemean([all_rewards for all_rewards in int_rewards_buf_v_0])) logger.logkv( 'ep_coor_rewmean_t_0', safemean([all_rewards for all_rewards in c_rewards_buf_t_0])) logger.logkv( 'ep_coor_rewmean_tv_0', safemean([all_rewards for all_rewards in c_rewards_buf_tv_0])) logger.logkv( 'ep_coor_rewmean_tv_ext_0', safemean([all_rewards for all_rewards in ext_rewards_buf_tv_0])) logger.logkv( 'ep_coor_rewmean_tv_int_0', safemean([all_rewards for all_rewards in int_rewards_buf_tv_0])) logger.logkv( 'ep_coor_rewmean_all_0', safemean([all_rewards for all_rewards in c_rewards_buf_all_0])) logger.logkv( 'ep_dec_rewmean_1', safemean([all_rewards for all_rewards in d_rewards_buf_1])) logger.logkv( 'ep_cen_rewmean_1', safemean([all_rewards for all_rewards in ce_rewards_buf_1])) logger.logkv( 'ep_coor_rewmean_show_1', safemean([all_rewards for all_rewards in c_rewards_buf_show_1])) logger.logkv( 'ep_coor_rewmean_r_1', safemean([all_rewards for all_rewards in c_rewards_buf_r_1])) logger.logkv( 'ep_coor_rewmean_v_1', safemean([all_rewards for all_rewards in c_rewards_buf_v_1])) logger.logkv( 'ep_coor_rewmean_v_ext_1', safemean([all_rewards for all_rewards in ext_rewards_buf_v_1])) logger.logkv( 'ep_coor_rewmean_v_int_1', safemean([all_rewards for all_rewards in int_rewards_buf_v_1])) logger.logkv( 'ep_coor_rewmean_t_1', safemean([all_rewards for all_rewards in c_rewards_buf_t_1])) logger.logkv( 'ep_coor_rewmean_tv_1', safemean([all_rewards for all_rewards in c_rewards_buf_tv_1])) logger.logkv( 'ep_coor_rewmean_tv_ext_1', safemean([all_rewards for all_rewards in ext_rewards_buf_tv_1])) logger.logkv( 'ep_coor_rewmean_tv_int_1', safemean([all_rewards for all_rewards in int_rewards_buf_tv_1])) logger.logkv( 'ep_coor_rewmean_all_1', safemean([all_rewards for all_rewards in c_rewards_buf_all_1])) logger.logkv( 'ep_penalty_rewmean', safemean([all_rewards for all_rewards in p_rewards_buf])) # logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) for agent_i in range(env.spec): ev = explained_variance(values_n[agent_i], returns_n[agent_i]) logger.logkv("explained_variance-%i" % agent_i, float(ev)) if eval_env is not None: logger.logkv( 
                        'eval_eprewmean',
                        safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                    logger.logkv(
                        'eval_eplenmean',
                        safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
                for (lossval, lossname) in zip(lossvals_n[agent_i],
                                               models[agent_i].loss_names):
                    logger.logkv(lossname + ('-%i' % agent_i), lossval)
                if env.args.s_alg_name == 'noisy' or env.args.s_alg_name == 'cen' or \
                        env.args.s_alg_name == 'dec':
                    pass
                else:
                    for (lossval, lossname) in zip(e_lossvals_n[agent_i],
                                                   models[agent_i].loss_names):
                        logger.logkv(lossname + ('-e-%i' % agent_i), lossval)
                    for (lossval, lossname) in zip(c_lossvals_n[agent_i],
                                                   models[agent_i].loss_names):
                        logger.logkv(lossname + ('-c-%i' % agent_i), lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update == 1) \
                and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            for i_m, m in enumerate(models):
                checkdir = osp.join(logger.get_dir(), 'checkpoints-%i' % i_m)
                os.makedirs(checkdir, exist_ok=True)
                savepath = osp.join(checkdir, '%.5i' % update)
                print('Saving to', savepath)
                m.save(savepath)

    return models
def test(network,
         test_env: VecEnv,
         n_steps: int = 2048,
         ent_coef: float = 0.,
         vf_coef: float = .5,
         max_grad_norm: float = .5,
         gamma: float = .99,
         lmbda: float = .95,
         n_minibatches: int = 1,
         load_path: str = None,
         model_fn=None,
         mpi_rank_weight: int = 1,
         comm=None,
         **network_kwargs):
    # Load models
    policy = build_policy(test_env, network, **network_kwargs)

    # Get the nb of env
    nenvs = test_env.num_envs

    # Get state_space and action_space
    ob_space = test_env.observation_space
    ac_space = test_env.action_space

    # Calculate the batch_size
    nbatch = nenvs * n_steps
    nbatch_train = nbatch // n_minibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        # model_fn = Model
        model_fn = ADRModel

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=n_steps,
                     ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm,
                     comm=comm, mpi_rank_weight=mpi_rank_weight)
    model.load(load_path)

    runner = Runner(env=test_env, model=model, nsteps=n_steps, gamma=gamma, lam=lmbda)
    epinfobuf = deque(maxlen=100)

    obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()
    epinfobuf.extend(epinfos)

    # Get reward stats
    eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf])
    eprewstd = safestd([epinfo['r'] for epinfo in epinfobuf])
    logger.logkv('eprewmean', eprewmean)
    logger.logkv('eprewstd', eprewstd)

    return eprewmean, eprewstd
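# --- Hedged usage sketch (added for illustration; not part of the original source) ---
# `test` above rolls the loaded policy out for one batch of `n_steps` per environment
# and returns the mean and standard deviation of the episode rewards seen in that
# batch. The checkpoint path is an assumption; `test_env` is a pre-built VecEnv.
def _example_test(test_env):
    mean_rew, std_rew = test(network='mlp', test_env=test_env, n_steps=2048,
                             load_path='/path/to/checkpoint')  # hypothetical path
    print('reward: {:.2f} +/- {:.2f}'.format(mean_rew, std_rew))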
def main(): num_envs = 64 learning_rate = 5e-4 ent_coef = .01 gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 total_timesteps = 1_000_000 ## now this counts steps in testing runs use_vf_clipping = True ## From random_ppo.py max_grad_norm = 0.5 vf_coef = 0.5 L2_WEIGHT = 10e-4 FM_COEFF = 0.002 REAL_THRES = 0.1 parser = argparse.ArgumentParser( description='Process procgen testing arguments.') parser.add_argument('--env_name', type=str, default='fruitbot') parser.add_argument( '--distribution_mode', type=str, default='easy', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=1000) ## default starting_level set to 50 to test on unseen levels! parser.add_argument('--start_level', type=int, default=1000) parser.add_argument('--run_id', '-id', type=int, default=0) parser.add_argument('--load_id', type=int, default=0) parser.add_argument('--nrollouts', '-nroll', type=int, default=0) args = parser.parse_args() args.total_timesteps = total_timesteps if args.nrollouts: total_timesteps = int(args.nrollouts * num_envs * nsteps) run_ID = 'run_' + str(args.run_id).zfill(2) run_ID += '_load{}'.format(args.load_id) comm = MPI.COMM_WORLD rank = comm.Get_rank() mpi_rank_weight = 0 num_levels = args.num_levels log_comm = comm.Split(0, 0) format_strs = ['csv', 'stdout', 'log'] if log_comm.Get_rank() == 0 else [] logpath = join(LOG_DIR, run_ID) if not os.path.exists(logpath): os.system("mkdir -p %s" % logpath) fpath = join(logpath, 'args_{}.json'.format(run_ID)) with open(fpath, 'w') as fh: json.dump(vars(args), fh, indent=4, sort_keys=True) print("\nSaved args at:\n\t{}\n".format(fpath)) logger.configure(dir=logpath, format_strs=format_strs) logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.compat.v1.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.compat.v1.Session(config=config) sess.__enter__() logger.info("Testing") ## Modified based on random_ppo.learn env = venv nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches nrollouts = total_timesteps // nbatch network = lambda x: build_impala_cnn(x, depths=[16, 32, 32], emb_size=256) policy = build_policy(env, network) model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) LOAD_PATH = "log/vanilla/saved_vanilla_v{}.tar".format(args.load_id) model.load(LOAD_PATH) logger.info("Model pramas loaded from save") runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf10 = deque(maxlen=10) epinfobuf100 = deque(maxlen=100) # tfirststart = time.time() ## Not doing timing yet # active_ep_buf = epinfobuf100 mean_rewards = [] datapoints = [] for rollout in range(1, nrollouts + 1): logger.info('collecting rollouts {}...'.format(rollout)) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) ## differnent from random_ppo! 
        epinfobuf10.extend(epinfos)
        epinfobuf100.extend(epinfos)

        rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
        rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
        ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
        ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])

        logger.info('\n----', rollout)
        mean_rewards.append(rew_mean_10)
        logger.logkv('eprew10', rew_mean_10)
        logger.logkv('eprew100', rew_mean_100)
        logger.logkv('eplenmean10', ep_len_mean_10)
        logger.logkv('eplenmean100', ep_len_mean_100)
        logger.logkv("misc/total_timesteps", rollout * nbatch)
        logger.info('----\n')
        logger.dumpkvs()

    env.close()
    print("Rewards history: ", mean_rewards)
    return mean_rewards
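# --- Added for illustration (assumption; the original listing may omit or truncate
# this): the conventional entry-point guard so the testing script above can be run
# directly, e.g. `python <script> --env_name fruitbot --load_id 0`, using the argparse
# flags defined in main().
if __name__ == '__main__':
    main()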
def learn(*, network, env, total_timesteps, early_stopping=False, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, scope='', **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' additional_params = network_kwargs["network_kwargs"] from baselines import logger # set_global_seeds(seed) We deal with seeds upstream if "LR_ANNEALING" in additional_params.keys(): lr_reduction_factor = additional_params["LR_ANNEALING"] start_lr = lr lr = lambda prop: (start_lr / lr_reduction_factor) + ( start_lr - (start_lr / lr_reduction_factor )) * prop # Anneals linearly from lr to lr/red factor if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) bestrew = 0 # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, scope=scope) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() best_rew_per_step = 0 run_info = defaultdict(list) nupdates = total_timesteps // nbatch print("TOT NUM UPDATES", nupdates) for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0, "Have {} total batch size and want {} minibatches, can't split evenly".format( nbatch, nminibatches) # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 eplenmean = safemean([epinfo['ep_length'] for epinfo in epinfos]) eprewmean = safemean([epinfo['r'] for epinfo in epinfos]) rew_per_step = eprewmean / eplenmean print("Curr learning rate {} \t Curr reward per step {}".format( lrnow, rew_per_step)) if rew_per_step > best_rew_per_step and early_stopping: # Avoid updating best model at first iteration because the means might be a bit off because # of how the multithreaded batch simulation works best_rew_per_step = eprewmean / eplenmean checkdir = osp.join(logger.get_dir(), 'checkpoints') model.save(checkdir + ".temp_best_model") print("Saved model as best", best_rew_per_step, "avg rew/step") epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in tqdm.trange(0, nbatch, nbatch_train, desc="{}/{}".format(_, noptepochs)): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf]) ep_dense_rew_mean = safemean( [epinfo['ep_shaped_r'] for epinfo in epinfobuf]) ep_sparse_rew_mean = safemean( [epinfo['ep_sparse_r'] for epinfo in epinfobuf]) eplenmean = safemean([epinfo['ep_length'] for epinfo in epinfobuf]) run_info['eprewmean'].append(eprewmean) run_info['ep_dense_rew_mean'].append(ep_dense_rew_mean) run_info['ep_sparse_rew_mean'].append(ep_sparse_rew_mean) run_info['eplenmean'].append(eplenmean) run_info['explained_variance'].append(float(ev)) logger.logkv( 'true_eprew', safemean([epinfo['ep_sparse_r'] for epinfo in epinfobuf])) logger.logkv('eprewmean', eprewmean) logger.logkv('eplenmean', eplenmean) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) time_elapsed = tnow - tfirststart logger.logkv('time_elapsed', time_elapsed) time_per_update = time_elapsed / update time_remaining = (nupdates - update) * time_per_update logger.logkv('time_remaining', time_remaining / 60) for (lossval, lossname) in zip(lossvals, model.loss_names): run_info[lossname].append(lossval) logger.logkv(lossname, lossval) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() # Update current logs if additional_params["RUN_TYPE"] in ["ppo", "joint_ppo"]: from overcooked_ai_py.utils import save_dict_to_file save_dict_to_file(run_info, additional_params["SAVE_DIR"] + "logs") # Linear annealing of reward shaping if additional_params["REW_SHAPING_HORIZON"] != 0: # Piecewise linear annealing schedule # annealing_thresh: until when we should stop doing 100% reward shaping # annealing_horizon: when we should reach doing 0% reward shaping annealing_horizon = additional_params[ 
"REW_SHAPING_HORIZON"] annealing_thresh = 0 def fn(x): if annealing_thresh != 0 and annealing_thresh - ( annealing_horizon / annealing_thresh) * x > 1: return 1 else: fn = lambda x: -1 * (x - annealing_thresh) * 1 / ( annealing_horizon - annealing_thresh) + 1 return max(fn(x), 0) curr_timestep = update * nbatch curr_reward_shaping = fn(curr_timestep) env.update_reward_shaping_param(curr_reward_shaping) print("Current reward shaping", curr_reward_shaping) sp_horizon = additional_params["SELF_PLAY_HORIZON"] # Save/overwrite best model if past a certain threshold if ep_sparse_rew_mean > bestrew and ep_sparse_rew_mean > additional_params[ "SAVE_BEST_THRESH"]: # Don't save best model if still doing some self play and it's supposed to be a BC model if additional_params[ "OTHER_AGENT_TYPE"][: 2] == "bc" and sp_horizon != 0 and env.self_play_randomization > 0: pass else: from human_aware_rl.ppo.ppo import save_ppo_model print("BEST REW", ep_sparse_rew_mean, "overwriting previous model with", bestrew) save_ppo_model( model, "{}seed{}/best".format( additional_params["SAVE_DIR"], additional_params["CURR_SEED"])) bestrew = max(ep_sparse_rew_mean, bestrew) # If not sp run, and horizon is not None, # vary amount of self play over time, either with a sigmoidal feedback loop # or with a fixed piecewise linear schedule. if additional_params[ "OTHER_AGENT_TYPE"] != "sp" and sp_horizon is not None: if type(sp_horizon) is not list: # Sigmoid self-play schedule based on current performance (not recommended) curr_reward = ep_sparse_rew_mean rew_target = sp_horizon shift = rew_target / 2 t = (1 / rew_target) * 10 fn = lambda x: -1 * (np.exp(t * (x - shift)) / (1 + np.exp(t * (x - shift)))) + 1 env.self_play_randomization = fn(curr_reward) print("Current self-play randomization", env.self_play_randomization) else: assert len(sp_horizon) == 2 # Piecewise linear self-play schedule # self_play_thresh: when we should stop doing 100% self-play # self_play_timeline: when we should reach doing 0% self-play self_play_thresh, self_play_timeline = sp_horizon def fn(x): if self_play_thresh != 0 and self_play_timeline - ( self_play_timeline / self_play_thresh) * x > 1: return 1 else: fn = lambda x: -1 * ( x - self_play_thresh) * 1 / ( self_play_timeline - self_play_thresh ) + 1 return max(fn(x), 0) curr_timestep = update * nbatch env.self_play_randomization = fn(curr_timestep) print("Current self-play randomization", env.self_play_randomization) if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and ( MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) # Visualization of rollouts with actual other agent run_type = additional_params["RUN_TYPE"] if run_type in ["ppo", "joint_ppo" ] and update % additional_params["VIZ_FREQUENCY"] == 0: from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld from overcooked_ai_py.agents.agent import AgentPair from overcooked_ai_py.agents.benchmarking import AgentEvaluator from human_aware_rl.baselines_utils import get_agent_from_model print(additional_params["SAVE_DIR"]) mdp = OvercookedGridworld.from_layout_name( **additional_params["mdp_params"]) overcooked_env = OvercookedEnv(mdp, **additional_params["env_params"]) agent = get_agent_from_model( model, additional_params["sim_threads"], 
is_joint_action=(run_type == "joint_ppo")) agent.set_mdp(mdp) if run_type == "ppo": if additional_params["OTHER_AGENT_TYPE"] == 'sp': agent_pair = AgentPair(agent, agent, allow_duplicate_agents=True) else: print("PPO agent on index 0:") env.other_agent.set_mdp(mdp) agent_pair = AgentPair(agent, env.other_agent) trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents( agent_pair, display=True, display_until=100) overcooked_env.reset() agent_pair.reset() print("tot rew", tot_rewards, "tot rew shaped", tot_shaped_rewards) print("PPO agent on index 1:") agent_pair = AgentPair(env.other_agent, agent) else: agent_pair = AgentPair(agent) trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents( agent_pair, display=True, display_until=100) overcooked_env.reset() agent_pair.reset() print("tot rew", tot_rewards, "tot rew shaped", tot_shaped_rewards) print(additional_params["SAVE_DIR"]) if nupdates > 0 and early_stopping: checkdir = osp.join(logger.get_dir(), 'checkpoints') print("Loaded best model", best_rew_per_step) model.load(checkdir + ".temp_best_model") return model, run_info
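# The learn() above anneals both the learning rate and the reward-shaping
# coefficient with ad-hoc lambdas. Below is a minimal standalone sketch of the
# same linear schedules (illustrative only; `linear_anneal` and
# `reward_shaping_coeff` are not helpers used by the code above):

def linear_anneal(frac, start, end):
    """Interpolate from `end` (frac=0) to `start` (frac=1).

    `frac` is the fraction of training remaining, as passed to the lr and
    cliprange schedules in baselines PPO2 (1 at the start, 0 at the end).
    """
    return end + (start - end) * frac

# Example: anneal lr from 3e-4 down to 3e-4 / 10 over training, and anneal the
# shaped-reward weight from 1 to 0 over a fixed timestep horizon.
start_lr, lr_reduction_factor = 3e-4, 10
lr_schedule = lambda frac: linear_anneal(frac, start_lr, start_lr / lr_reduction_factor)

def reward_shaping_coeff(timestep, horizon):
    # 1.0 at timestep 0, decaying linearly to 0.0 once `horizon` timesteps have elapsed.
    return max(1.0 - timestep / horizon, 0.0) if horizon else 0.0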
def main(): num_envs = 64 learning_rate = 5e-4 ent_coef = .01 ##new defined vf_coef = 0.5 max_grad_norm = 0.5 ########### gamma = .999 lam = .95 nsteps = 256 nminibatches = 8 ppo_epochs = 3 clip_range = .2 # timesteps_per_proc = 50_000_000 use_vf_clipping = True parser = argparse.ArgumentParser(description='Process procgen training arguments.') parser.add_argument('--env_name', type=str, default='coinrun') parser.add_argument('--distribution_mode', type=str, default='hard', choices=["easy", "hard", "exploration", "memory", "extreme"]) parser.add_argument('--num_levels', type=int, default=0) parser.add_argument('--start_level', type=int, default=0) parser.add_argument('--test_worker_interval', type=int, default=0) parser.add_argument('--total_timesteps', type=int, default=0) args = parser.parse_args() test_worker_interval = args.test_worker_interval comm = MPI.COMM_WORLD rank = comm.Get_rank() is_test_worker = False if test_worker_interval > 0: is_test_worker = comm.Get_rank() % test_worker_interval == (test_worker_interval - 1) mpi_rank_weight = 0 if is_test_worker else 1 num_levels = 0 if is_test_worker else args.num_levels log_comm = comm.Split(1 if is_test_worker else 0, 0) format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else [] logger.configure(dir=LOG_DIR, format_strs=format_strs, log_suffix="_total_timesteps_{}_num_levels_{}".format(args.total_timesteps, num_levels)) '''logger.info("creating environment") venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=args.start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False)''' logger.info("Creating dropout evaluation environment") eval_venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=100, start_level=2000, distribution_mode=args.distribution_mode) eval_venv = VecExtractDictObs(eval_venv, "rgb") eval_venv = VecMonitor( venv=eval_venv, filename=None, keep_buf=100, ) eval_venv = VecNormalize(venv=eval_venv, ob=False) logger.info("creating tf session") setup_mpi_gpus() config = tf.ConfigProto() config.gpu_options.allow_growth = True #pylint: disable=E1101 sess = tf.Session(config=config) sess.__enter__() conv_fn = lambda x: build_impala_cnn(x, is_train=False, depths=[16,32,32], emb_size=256) logger.info("testing dropout") policy = build_policy(eval_venv,conv_fn) nenvs = eval_venv.num_envs ob_space = eval_venv.observation_space ac_space = eval_venv.action_space nbatch = nenvs * nsteps nbatch_train = nbatch//nminibatches # Instantiate the model object (that creates act_model and train_model) from baselines.ppo2.model import Model model_fn = Model #modified from baseline ppo2 learn model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) model.load(MODEL_PATH) eval_runner = Runner(env=eval_venv, model=model, nsteps=nsteps, gamma=.999, lam=.95) eval_epinfobuf = deque(maxlen=100) nupdates = args.total_timesteps//nbatch log_interval = 1 for update in range(1, nupdates+1): #single upate to test eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() eval_epinfobuf.extend(eval_epinfos) if update % log_interval == 0 or update == 1: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for 
epinfo in eval_epinfobuf])) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv('misc/total_timesteps', update*nbatch) logger.dumpkvs() eval_venv.close()
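# The scripts above rely on two small helpers from baselines' ppo2 module that
# are not reproduced in this excerpt. Their definitions (matching
# baselines/ppo2/ppo2.py) are:

import numpy as np

def constfn(val):
    # Wrap a constant so it can be called like a schedule function of `frac`.
    def f(_):
        return val
    return f

def safemean(xs):
    # Mean that avoids division-by-zero warnings on an empty episode buffer.
    return np.nan if len(xs) == 0 else np.mean(xs)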
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps//nbatch for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632 epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) ) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) ) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) model.save(savepath) return model
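# `explained_variance(values, returns)` used in the logging blocks above comes
# from baselines.common.math_util. A sketch of the same computation: ev = 1
# means the value function predicts the returns perfectly, while ev <= 0 means
# it is no better than predicting a constant.

import numpy as np

def explained_variance(ypred, y):
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    # 1 - Var[y - ypred] / Var[y]
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary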
def learn(*, policy, env, FLAGS, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, average_window_size=int(1e6), stop=True, scenario='gfootball.scenarios.1_vs_1_easy', curriculum=np.linspace(0, 0.95, 20), **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- policy: policy network (as returned by build_policy()); replaces the `network` argument of the standard baselines API env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of updates between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller than or equal to the number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of updates between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) basic_builder = importlib.import_module(scenario, package=None) def build_builder_with_difficulty(difficulty): def builder_with_difficulty(builder): basic_builder.build_scenario(builder) builder.config().right_team_difficulty = difficulty builder.config().left_team_difficulty = difficulty return builder_with_difficulty difficulties = curriculum # piecewise curriculum of scenario difficulties # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) def make_runner(difficulty): def create_single_football_env(iprocess): """Creates gfootball environment.""" env = football_env.create_environment( env_name=build_builder_with_difficulty(difficulty), stacked=('stacked' in FLAGS.state), rewards=FLAGS.reward_experiment, logdir=logger.get_dir(), write_goal_dumps=FLAGS.dump_scores and (iprocess == 0), write_full_episode_dumps=FLAGS.dump_full_episodes and (iprocess == 0), render=FLAGS.render and (iprocess == 0), dump_frequency=50 if FLAGS.render and iprocess == 0 else 0) env = monitor.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(iprocess))) return env vec_env = SubprocVecEnv([ (lambda _i=i: create_single_football_env(_i)) for i in range(FLAGS.num_envs) ], context=None) return Runner(env=vec_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) # Instantiate the runner object at the easiest difficulty runner = make_runner(difficulties[0]) difficulty_idx = 0 if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) eprews = [] epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) if init_fn is not None: init_fn() # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps//nbatch update = 0 while True: update += 1 assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632 for r1, r2 in zip(returns, [i['r'] for i in epinfos]): assert r1 == r2 # assuming returns[i] and epinfos[i]['r'] are the same if update % log_interval == 0 and is_mpi_root: logger.info('Done.') eprews.extend(returns) epinfobuf.extend(epinfos) 
if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # for each minibatch calculate the loss and append it. mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predictor of the returns (ev > 1) # or if it's just worse than predicting nothing (ev <= 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update*nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('loss/' + lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) model.save(savepath) # Advance the curriculum once the recent window of returns clears the threshold if difficulty_idx < len(difficulties)-1 and len(eprews) >= average_window_size and sum(eprews[-average_window_size:]) >= 0.0: difficulty_idx += 1 runner = make_runner(difficulties[difficulty_idx]) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) # Terminate once the timestep budget is exhausted if stop and update >= nupdates: break return model
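# The gfootball learn() above steps through a difficulty curriculum once the
# reward over a recent window clears a threshold. A minimal, standalone sketch
# of that promotion rule (illustrative only; `CurriculumSchedule` is a
# hypothetical helper, not part of the code above):

import numpy as np

class CurriculumSchedule:
    def __init__(self, difficulties, window_size, reward_threshold=0.0):
        self.difficulties = list(difficulties)   # e.g. np.linspace(0, 0.95, 20)
        self.window_size = window_size
        self.reward_threshold = reward_threshold
        self.idx = 0
        self.recent_rewards = []

    def record(self, episode_rewards):
        # Keep only the most recent window of episode returns.
        self.recent_rewards.extend(episode_rewards)
        self.recent_rewards = self.recent_rewards[-self.window_size:]

    def maybe_promote(self):
        # Advance to the next difficulty once the window is full and the mean
        # reward over it is at or above the threshold.
        if (self.idx < len(self.difficulties) - 1
                and len(self.recent_rewards) >= self.window_size
                and np.mean(self.recent_rewards) >= self.reward_threshold):
            self.idx += 1
            return True
        return False

    @property
    def difficulty(self):
        return self.difficulties[self.idx]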
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, **network_kwargs): '''Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) Daniel: should be `T` in the paper. Atari defaults are 128 as in the paper. total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective Daniel: 0.5 by default but the PPO paper uses 1.0 for Atari games. max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update Daniel: 4 by default but the PPO paper uses 3 (for Atari games), etc. cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. Daniel: for `tf.clip_by_value(ratio, 1-cliprange, 1+cliprange)` save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. ''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) # Daniel: hacky within PPO2 solution for limiting action ranges. 
if 'limit_act_range' in network_kwargs: limit_act_range = network_kwargs['limit_act_range'] network_kwargs.pop('limit_act_range') else: limit_act_range = False policy = build_policy(env, network, limit_act_range=limit_act_range, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if load_path is not None: logger.info("\nInside ppo2, loading model from: {}".format(load_path)) model.load(load_path) # Daniel: debugging and sanity checks _variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) tf_util.display_var_info(_variables) # Instantiate the runner object (Daniel: calls `env.reset()` so can take a while for cloth) # Also, I'm going to assume that if total_timesteps=0 then we don't waste time creating this. if total_timesteps > 0: runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps//nbatch # Daniel: debugging and sanity checks logger.info("\nInside ppo2, before updates (`env.reset()` called before this)") logger.info(" nsteps: {}, each env in VecEnv does this many to get minibatch".format(nsteps)) logger.info(" nbatch: {}, i.e., nsteps * nenv, size of data from (get_minibatch)".format(nbatch)) logger.info(" nbatch_train: {}, batch size for actual gradient update within epoch".format(nbatch_train)) logger.info(" noptepochs: {}, number of epochs over collected minibatch for PPO updates".format(noptepochs)) logger.info(" nupdates: {}, number of (get_minibatch, update_net) cycles".format(nupdates)) logger.info(" our model_fn class: {}".format(model_fn)) logger.info("(end of debugging messages)\n") for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos, ep_all_infos = runner.run() #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos, eval_ep_all_infos = eval_runner.run() #pylint: disable=E0632 epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) ) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) ) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) logger.info('Saving model checkpoint to: ', savepath) model.save(savepath) # ------------------------------------------------------------------ # Daniel: extra stuff for debugging PPO on cloth, actions and infos for each episode. logstd_vals = model.act_model.get_logstd_values() action_dir = osp.join(logger.get_dir(), 'actions') episode_dir = osp.join(logger.get_dir(), 'ep_all_infos') logstd_dir = osp.join(logger.get_dir(), 'logstd') os.makedirs(action_dir, exist_ok=True) os.makedirs(episode_dir, exist_ok=True) os.makedirs(logstd_dir, exist_ok=True) act_savepath = osp.join(action_dir, 'actions_%.5i.pkl'%update) epi_savepath = osp.join(episode_dir, 'infos_%.5i.pkl'%update) std_savepath = osp.join(logstd_dir, 'logstd_%.5i.pkl'%update) with open(act_savepath, 'wb') as fh: pickle.dump(actions, fh) with open(epi_savepath, 'wb') as fh: pickle.dump(ep_all_infos, fh) with open(std_savepath, 'wb') as fh: pickle.dump(logstd_vals, fh) # ------------------------------------------------------------------ return model
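# All of the learn() variants above share the same recurrent-minibatch
# indexing: the rollout is laid out as (nenvs * nsteps) rows, and `flatinds`
# lets a minibatch pull out whole per-environment trajectories so LSTM states
# stay aligned with their observations. A small numeric illustration of that
# indexing (values chosen only for the example):

import numpy as np

nenvs, nsteps, nminibatches = 4, 3, 2
envsperbatch = nenvs // nminibatches

# Row i*nsteps + t holds step t of environment i (after the runner's flatten).
flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)

envinds = np.arange(nenvs)          # shuffled each epoch in the real loop
mbenvinds = envinds[:envsperbatch]  # one minibatch = whole environments
mbflatinds = flatinds[mbenvinds].ravel()

print(flatinds)
# [[ 0  1  2]
#  [ 3  4  5]
#  [ 6  7  8]
#  [ 9 10 11]]
print(mbflatinds)
# [0 1 2 3 4 5] -> all steps of envs 0 and 1, kept in time order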
def learn(network, env, nsteps, total_timesteps, mvs, ckpt, seed=None, ent_coef=0.0, lr=1e-3, vf_coef=0.5, max_grad_norm=0.5, noptepochs=4, gamma=0.99, lam=0.95, log_interval=10, cliprange=0.2, save_interval=1, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, load_path=None, **network_kwargs): set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) # build policy policy = build_policy(env, network, **network_kwargs) # Calculate the batch_size nenvs = env.num_envs nminibatches = 1 nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn( policy=policy, nbatch_act=nenvs, nbatch_train=None, #nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) print('Model has been successfully loaded from {0}'.format(load_path)) else: try: lp = osp.join(logger.get_dir(), 'checkpoints/{0}'.format(ckpt)) model.load(lp) print('Model has been successfully loaded from {0}'.format(lp)) except Exception as e: print(e) # Instantiate the runner object and episode buffer runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, mvs=mvs) epinfobuf = deque(maxlen=log_interval * nenvs) best_reward = -np.inf if init_fn is not None: init_fn() # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # decreases from 1 to 0 lrnow = lr(frac) cliprangenow = cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] # print(states.shape, mbstates.shape) mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) lossvals = np.mean(mblossvals, axis=0) tnow = time.perf_counter() fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("misc/fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('stats/eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('stats/eprewmin', np.min([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('stats/eprewmax', np.max([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('stats/eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('misc/' + lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, 'last') print('Saving to', savepath) model.save(savepath) if len(epinfobuf) == log_interval * nenvs and safemean( [epinfo['r'] for epinfo in epinfobuf]) > best_reward: savepath = osp.join(checkdir, 'best') print('Saving to', savepath) model.save(savepath) best_reward = safemean([epinfo['r'] for epinfo in epinfobuf]) model.sess.close() return model
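# Every Runner used above produces `returns` and `values` via Generalized
# Advantage Estimation controlled by the `gamma` and `lam` arguments. A compact
# per-environment sketch of that recurrence (written here only to document
# what the (gamma, lam) knobs do; the baselines Runner implements the batched
# equivalent, with dones indexed against the next observation):

import numpy as np

def compute_gae(rewards, values, dones, last_value, gamma=0.99, lam=0.95):
    """rewards, values, dones: length-T arrays for one environment;
    dones[t] = 1 if the episode ended after step t."""
    T = len(rewards)
    advs = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nextvalue = last_value if t == T - 1 else values[t + 1]
        nonterminal = 1.0 - dones[t]
        # TD error, then discounted accumulation of future TD errors.
        delta = rewards[t] + gamma * nextvalue * nonterminal - values[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        advs[t] = lastgaelam
    returns = advs + values  # what model.train receives as `returns`
    return advs, returns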