def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=8,
          ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0,
          load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1,
          comm=None, **network_kwargs):
    '''
    Learn a policy using the PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn,
        cnn_small, conv_only - see baselines.common/models.py for the full list) specifying a
        standard network architecture, or a function that takes a tensorflow tensor as input and
        returns a tuple (output_tensor, extra_feed), where output_tensor is the last network
        layer output and extra_feed is None for feed-forward nets or, for recurrent nets, a
        dictionary describing how to feed state into the network. See common/models.py/lstm for
        more details on using recurrent nets in policies.

    env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel
        environment simulation. Environments produced by gym.make can be wrapped using the
        baselines.common.vec_env.DummyVecEnv class.

    nsteps: int - number of steps of the vectorized environment per update (i.e. the batch size
        is nsteps * nenv, where nenv is the number of environment copies simulated in parallel)

    total_timesteps: int - number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float - policy entropy coefficient in the optimization objective

    lr: float or function - learning rate, constant or a schedule function [0,1] -> R+, where 1
        is the beginning of training and 0 is the end of training

    vf_coef: float - value function loss coefficient in the optimization objective

    max_grad_norm: float or None - gradient norm clipping coefficient

    gamma: float - discounting factor

    lam: float - advantage estimation discounting factor (lambda in the paper)

    log_interval: int - number of updates between logging events

    nminibatches: int - number of training minibatches per update. For recurrent policies, should
        be less than or equal to the number of environments run in parallel.

    noptepochs: int - number of training epochs per update

    cliprange: float or function - clipping range, constant or a schedule function [0,1] -> R+,
        where 1 is the beginning of training and 0 is the end of training

    save_interval: int - number of updates between saving events

    load_path: str - path to load the model from

    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and the arguments to a particular type of
        network. For instance, the 'mlp' network architecture has arguments num_hidden and
        num_layers.
    '''
    print("PPO2 is running")
    set_global_seeds(seed)

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)
    print("Type of policy in ppo2.py: {}".format(policy))

    # Get the number of environments
    # nenvs = env.num_envs
    nenvs = 1

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    print("Making model")
    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
                     nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef,
                     vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm,
                     mpi_rank_weight=mpi_rank_weight)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    print("Initializing runner")
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    print("Runner successfully initialized")
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
        print("Eval runner initialized")

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    print("Number of timesteps {}".format(total_timesteps))
    print("Batch size {}".format(nbatch))
    nupdates = total_timesteps // nbatch
    print("Number of updates {}".format(nupdates))

    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  # pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Done.')

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev close to 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv('loss/' + lossname, lossval)
            logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
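# A minimal usage sketch for the `learn` above, assuming a single Gym environment wrapped in
# DummyVecEnv. The environment id, the `_example_run_ppo2` helper name, and the hyperparameters
# are illustrative assumptions, not taken from this code; the point is the schedule convention
# documented in the docstring: `lr` (and `cliprange`) may be callables over the remaining-progress
# fraction in [0, 1].
def _example_run_ppo2(total_timesteps=100_000):
    import gym
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    venv = DummyVecEnv([lambda: gym.make('CartPole-v1')])
    linear_lr = lambda frac: 3e-4 * frac  # frac is 1.0 at the start of training and decays to 0.0
    return learn(network='mlp', env=venv, total_timesteps=total_timesteps,
                 lr=linear_lr, cliprange=0.2, nsteps=128, nminibatches=4)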
def learn(*, network, env, reward_giver, expert_dataset, g_step, d_step, d_stepsize=3e-4, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, **network_kwargs): # from PPO learn set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # nenvs = env.num_envs nenvs = 1 ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, reward_giver=reward_giver) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) if init_fn is not None: init_fn() tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch # from TRPO MPI nworkers = MPI.COMM_WORLD.Get_size() ob = model.act_model.X ac = model.A d_adam = MpiAdam(reward_giver.get_trainable_variables()) def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out # from PPO for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) logger.log("Optimizing Policy...") for _ in range(g_step): if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run( ) if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) mblossvals = [] if states is None: inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append( model.train(lrnow, cliprangenow, *slices)) else: assert False # make sure we're not going here, so any bugs aren't from here assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in 
(obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) lossvals = np.mean(mblossvals, axis=0) tnow = time.perf_counter() fps = int(nbatch / (tnow - tstart)) # TRPO MPI logger.log("Optimizing Disciminator...") logger.log(fmt_row(13, reward_giver.loss_name)) ob_expert, ac_expert = expert_dataset.get_next_batch(len(obs)) batch_size = len(obs) // d_step d_losses = [] for ob_batch, ac_batch in dataset.iterbatches( (obs, actions), include_final_partial_batch=False, batch_size=batch_size): ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) if hasattr(reward_giver, "obs_rms"): reward_giver.obs_rms.update( np.concatenate((ob_batch, ob_expert), 0)) *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) d_adam.update(allmean(g), d_stepsize) d_losses.append(newlosses) logger.log(fmt_row(13, np.mean(d_losses, axis=0))) if update_fn is not None: update_fn(update) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv( "eval_eprewmean", safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv( "eval_eplenmean", safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv("misc/time_elapsed", tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv("loss/" + lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print("Saving to", savepath) model.save(savepath) return model
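# The discriminator step above averages gradients across MPI workers with `allmean` before handing
# them to MpiAdam. A self-contained sketch of that reduce-then-average pattern, assuming mpi4py is
# available (the helper name and the toy gradient values are illustrative):
def _example_allmean_pattern():
    import numpy as np
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    nworkers = comm.Get_size()
    local_grad = np.arange(4, dtype=np.float64) * (comm.Get_rank() + 1)  # per-worker gradient
    averaged = np.empty_like(local_grad)
    comm.Allreduce(local_grad, averaged, op=MPI.SUM)  # sum the gradients over all workers
    averaged /= nworkers                              # then divide to get the mean
    return averaged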
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, save_path=None,load_path=None): if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) time_string = int(time.time()) if save_interval and save_path: import cloudpickle with open(osp.join(save_path, 'make_model_{}.pkl'.format(time_string)), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: model.load(load_path) runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) tfirststart = time.time() nupdates = total_timesteps//nbatch for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 epinfobuf.extend(epinfos) mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and save_path: checkdir = osp.join(save_path, 'checkpoints_{}'.format(time_string)) os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) model.save(savepath) if save_path: checkdir = 
osp.join(save_path, 'checkpoints_{}'.format(time_string)) os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Final save to', savepath) model.save(savepath) env.close()
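# The loop above serializes `make_model` with cloudpickle so the model constructor can be rebuilt
# without re-importing this module. A hypothetical restore-side sketch (the helper name and paths
# are illustrative; the pickle corresponds to the make_model_{}.pkl file written above and the
# checkpoint to one of the '%.5i' files):
def _example_restore_make_model(pkl_path, checkpoint_path):
    import cloudpickle

    with open(pkl_path, 'rb') as fh:
        make_model = cloudpickle.loads(fh.read())
    model = make_model()         # rebuilds the graph with the pickled hyperparameters
    model.load(checkpoint_path)  # then restores the trained weights
    return model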
def learn(*, network, env, total_timesteps, per_mdp_optimal_policies='ppo2', eval_env=None,
          seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5,
          gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=0, load_path=None, model_fn=None, **network_kwargs):
    '''
    Learn a policy using the Posterior Sampling Reinforcement Learning algorithm (TODO: link)

    Parameters:
    ----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn,
        cnn_small, conv_only - see baselines.common/models.py for the full list) specifying a
        standard network architecture, or a function that takes a tensorflow tensor as input and
        returns a tuple (output_tensor, extra_feed), where output_tensor is the last network
        layer output and extra_feed is None for feed-forward nets or, for recurrent nets, a
        dictionary describing how to feed state into the network. See common/models.py/lstm for
        more details on using recurrent nets in policies.

    env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel
        environment simulation. Environments produced by gym.make can be wrapped using the
        baselines.common.vec_env.DummyVecEnv class.

    total_timesteps: int - number of timesteps (i.e. number of actions taken in the environment)

    gamma: float - discounting factor

    lam: float - advantage estimation discounting factor (lambda in the paper)

    log_interval: int - number of updates between logging events

    save_interval: int - number of updates between saving events

    load_path: str - path to load the model from

    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and the arguments to a particular type of
        network. For instance, the 'mlp' network architecture has arguments num_hidden and
        num_layers.
    '''
    set_global_seeds(seed)

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    policy = build_policy(env, network, **network_kwargs)

    # Get the number of environments
    nenvs = env.num_envs
    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space
    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs,
                     nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef,
                     vf_coef=vf_coef, max_grad_norm=max_grad_norm)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Start total timer
    tfirststart = time.time()

    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        # Get minibatch
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  # pylint: disable=E0632
        if eval_env is not None:
            eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        # For each minibatch, calculate the loss and append it.
        mblossvals = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.time()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev close to 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    return model
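# `explained_variance` above summarizes how well the value predictions track the empirical
# returns: values near 1 mean the value function is a good predictor, 0 means no better than
# predicting the mean return, and negative values mean worse than that. A small NumPy sketch of
# the quantity being logged (baselines computes the equivalent in
# baselines.common.explained_variance); the helper name here is illustrative:
def _example_explained_variance(ypred, y):
    import numpy as np
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary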
def train(self,
          num_steps,
          player,
          replay_buffer,
          optimize_op,
          train_interval=1,
          target_interval=8192,
          batch_size=32,
          min_buffer_size=20000,
          tf_schedules=(),
          handle_ep=lambda steps, rew: None,
          timeout=None):
    """
    Run an automated training loop.

    This is meant to provide a convenient way to run a standard training loop without any
    modifications. You may get more flexibility by writing your own training loop.

    Args:
      num_steps: the number of timesteps to run.
      player: the Player for gathering experience.
      replay_buffer: the ReplayBuffer for experience.
      optimize_op: a TF Op to optimize the model.
      train_interval: timesteps per training step.
      target_interval: number of timesteps between target network updates.
      batch_size: the size of experience mini-batches.
      min_buffer_size: minimum replay buffer size before training is performed.
      tf_schedules: a sequence of TFSchedules that are updated with the number of steps taken.
      handle_ep: called with information about every completed episode.
      timeout: if set, this is a number of seconds after which the training loop should exit.
    """
    sess = self.online_net.session
    sess.run(self.update_target)
    steps_taken = 0
    next_target_update = target_interval
    next_train_step = train_interval
    start_time = time.time()
    eprew_buf = deque(maxlen=100)
    eplen_buf = deque(maxlen=100)
    loss_buf = deque(maxlen=self.log_interval)
    n_updates = 0
    if self.data_aug != 'no_aug' and self.mpi_rank_weight > 0:
        if self.data_aug == "cutout_color":
            self.aug_func = Cutout_Color(batch_size=batch_size)
        elif self.data_aug == "crop":
            self.aug_func = Rand_Crop(batch_size=batch_size, sess=sess)
        else:
            raise ValueError("Invalid value for argument data_aug.")
    while steps_taken < num_steps:
        if timeout is not None and time.time() - start_time > timeout:
            return
        transitions = player.play()
        for trans in transitions:
            if trans['is_last']:
                eprew_buf.append(trans['total_reward'])
                eplen_buf.append(trans['episode_step'] + 1)
                # handle_ep(trans['episode_step'] + 1, trans['total_reward'])
            replay_buffer.add_sample(trans)
            steps_taken += 1
            for sched in tf_schedules:
                sched.add_time(sess, 1)
            if replay_buffer.size >= min_buffer_size and steps_taken >= next_train_step:
                next_train_step = steps_taken + train_interval
                batch = replay_buffer.sample(batch_size)
                feed_dict = self.feed_dict(batch)
                _, losses = sess.run((optimize_op, self.losses), feed_dict=feed_dict)
                # gather batch
                if self.mix_mode == 'mixreg':
                    batch = [batch[i] for i in feed_dict[self.indices_ph]]
                replay_buffer.update_weights(batch, losses)
                loss_buf.append(np.mean(losses))
                n_updates += 1
                # logging
                if n_updates % self.log_interval == 0:
                    logger.logkv('misc/is_test_work', self.mpi_rank_weight == 0)
                    logger.logkv('eprewmean', np.mean(eprew_buf))
                    logger.logkv('eplenmean', np.mean(eplen_buf))
                    logger.logkv('loss', np.mean(loss_buf))
                    logger.logkv('misc/time_elapsed', time.time() - start_time)
                    logger.logkv('misc/steps_taken', steps_taken)
                    logger.dumpkvs()
            if steps_taken >= next_target_update:
                next_target_update = steps_taken + target_interval
                sess.run(self.update_target)
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, log_path = '', train=True): # logger.configure('/scratch/msy290/RL/aborg/retro_contest_agent/metalearner_for_expt/model/') logger.configure(log_path+'model/') if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches assert nbatch % nminibatches == 0 make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) model = make_model() if load_path is not None: model.load(load_path) runner = Runner(env=env, model=model, nsteps=nsteps, total_timesteps=total_timesteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) tfirststart = time.time() # Experience replay a la PPO-ER with L=2: https://arxiv.org/abs/1710.04423 use_experience_replay = False nupdates = total_timesteps//nbatch for update in range(1, nupdates+1): tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) if not use_experience_replay or update % 2 == 1: obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(update) #pylint: disable=E0632 else: obs2, returns2, masks2, actions2, values2, neglogpacs2, states, epinfos = runner.run(update) #pylint: disable=E0632 epinfobuf.extend(epinfos) mblossvals = [] if states is None: # nonrecurrent version if use_experience_replay and update != 1: inds = list(np.arange(nbatch * 2)) for _ in range(noptepochs): random.sample(inds, nbatch) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (np.concatenate((obs, obs2)), np.concatenate((returns, returns2)), np.concatenate((masks, masks2)), np.concatenate((actions, actions2)), np.concatenate((values, values2)), np.concatenate((neglogpacs, neglogpacs2)))) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: inds = np.arange(int(nbatch/obs.shape[1])) inds = np.tile(inds, ( obs.shape[1],1)) inds = disarrange(inds) for _ in range(noptepochs): for start in range(0, nsteps, int(nbatch_train/nenvs)): end = start + int(nbatch_train/nenvs) n_env = obs.shape[1] for j in range(n_env): mbinds = inds[j][start:end] slices = (arr[mbinds] for arr in (obs[:,j,:,:,:], returns[:,j], masks[:,j], actions[:,j], values[:,j], neglogpacs[:,j])) if train: mblossvals.append(model.train(j,lrnow, cliprangenow, *slices)) else: mblossvals.append(model.train(nenvs,lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 assert use_experience_replay == False envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] 
mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) values = sf01(values) returns = sf01(returns) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) model.save(savepath) model.filmObj.reinit() env.close()
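# `sf01` above flattens the (nsteps, nenv, ...) rollout arrays before computing explained
# variance. A sketch matching the helper of the same name in baselines' ppo2, assuming `arr` is a
# NumPy array: swap the first two axes, then merge them into a single batch axis.
def _example_sf01(arr):
    s = arr.shape
    return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])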
def learn( *, network, env, eval_env, make_eval_env, env_id, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, sil_update=10, sil_value=0.01, sil_alpha=0.6, sil_beta=0.1, sil_loss=0.1, # MBL # For train mbl mbl_train_freq=5, # For eval num_eval_episodes=5, eval_freq=5, vis_eval=False, eval_targs=('mbmf', ), #eval_targs=('mf',), quant=2, # For mbl.step #num_samples=(1500,), num_samples=(1, ), horizon=(2, ), #horizon=(2,1), #num_elites=(10,), num_elites=(1, ), mbl_lamb=(1.0, ), mbl_gamma=0.99, #mbl_sh=1, # Number of step for stochastic sampling mbl_sh=10000, #vf_lookahead=-1, #use_max_vf=False, reset_per_step=(0, ), # For get_model num_fc=2, num_fwd_hidden=500, use_layer_norm=False, # For MBL num_warm_start=int(1e4), init_epochs=10, update_epochs=5, batch_size=512, update_with_validation=False, use_mean_elites=1, use_ent_adjust=0, adj_std_scale=0.5, # For data loading validation_set_path=None, # For data collect collect_val_data=False, # For traj collect traj_collect='mf', # For profile measure_time=True, eval_val_err=False, measure_rew=True, save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. 
See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. ''' if not isinstance(num_samples, tuple): num_samples = (num_samples, ) if not isinstance(horizon, tuple): horizon = (horizon, ) if not isinstance(num_elites, tuple): num_elites = (num_elites, ) if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, ) if not isinstance(reset_per_step, tuple): reset_per_step = (reset_per_step, ) if validation_set_path is None: if collect_val_data: validation_set_path = os.path.join(logger.get_dir(), 'val.pkl') else: validation_set_path = os.path.join('dataset', '{}-val.pkl'.format(env_id)) if eval_val_err: eval_val_err_path = os.path.join('dataset', '{}-combine-val.pkl'.format(env_id)) logger.log(locals()) logger.log('MBL_SH', mbl_sh) logger.log('Traj_collect', traj_collect) if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) np.set_printoptions(precision=3) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: model_fn = Model make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, sil_update=sil_update, fn_reward=None, fn_obs=None, sil_value=sil_value, sil_alpha=sil_alpha, sil_beta=sil_beta, sil_loss=sil_loss, comm=comm, mpi_rank_weight=mpi_rank_weight, ppo=True, prev_pi=None) model = make_model() pi = model.sil_model if load_path is not None: model.load(load_path) # MBL # --------------------------------------- #viz = Visdom(env=env_id) win = None eval_targs = list(eval_targs) logger.log(eval_targs) make_model_f = get_make_mlp_model(num_fc=num_fc, num_fwd_hidden=num_fwd_hidden, layer_norm=use_layer_norm) mbl = MBL(env=eval_env, env_id=env_id, make_model=make_model_f, num_warm_start=num_warm_start, init_epochs=init_epochs, update_epochs=update_epochs, batch_size=batch_size, **network_kwargs) val_dataset = {'ob': None, 'ac': None, 'ob_next': None} if update_with_validation: logger.log('Update with validation') val_dataset = load_val_data(validation_set_path) if eval_val_err: logger.log('Log val error') eval_val_dataset = load_val_data(eval_val_err_path) if collect_val_data: logger.log('Collect validation data') val_dataset_collect = [] def _mf_pi(ob, t=None): stochastic = True ac, vpred, _, _ = pi.step(ob, stochastic=stochastic) return ac, vpred def _mf_det_pi(ob, t=None): #ac, vpred, _, _ = pi.step(ob, stochastic=False) ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob) return ac, vpred def _mf_ent_pi(ob, t=None): mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob) ac = 
np.random.normal(mean, std * adj_std_scale, size=mean.shape) return ac, vpred ################### use_ent_adjust======> adj_std_scale????????pi action sample def _mbmf_inner_pi(ob, t=0): if use_ent_adjust: return _mf_ent_pi(ob) else: #return _mf_pi(ob) if t < mbl_sh: return _mf_pi(ob) else: return _mf_det_pi(ob) # --------------------------------------- # Run multiple configuration once all_eval_descs = [] def make_mbmf_pi(n, h, e, l): def _mbmf_pi(ob): ac, rew = mbl.step(ob=ob, pi=_mbmf_inner_pi, horizon=h, num_samples=n, num_elites=e, gamma=mbl_gamma, lamb=l, use_mean_elites=use_mean_elites) return ac[None], rew return Policy(step=_mbmf_pi, reset=None) for n in num_samples: for h in horizon: for l in mbl_lamb: for e in num_elites: if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew', 'MBL_PPO_SIL', make_mbmf_pi(n, h, e, l))) #if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), 'MBL_TRPO-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), make_mbmf_pi(n, h, e, l))) if 'mf' in eval_targs: all_eval_descs.append( ('MeanRew', 'PPO_SIL', Policy(step=_mf_pi, reset=None))) logger.log('List of evaluation targets') for it in all_eval_descs: logger.log(it[0]) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield pool = Pool(mp.cpu_count()) warm_start_done = False U.initialize() if load_path is not None: pi.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=40) if init_fn is not None: init_fn() if traj_collect == 'mf': obs = runner.run()[0] # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer if hasattr(model.train_model, "ret_rms"): model.train_model.ret_rms.update(returns) if hasattr(model.train_model, "rms"): model.train_model.rms.update(obs) tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 # Val data collection if collect_val_data: for ob_, ac_, ob_next_ in zip(obs[:-1, 0, ...], actions[:-1, ...], obs[1:, 0, ...]): val_dataset_collect.append( (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_))) # ----------------------------- # MBL update else: ob_mbl, ac_mbl = obs.copy(), actions.copy() mbl.add_data_batch(ob_mbl[:-1, ...], ac_mbl[:-1, ...], ob_mbl[1:, ...]) mbl.update_forward_dynamic(require_update=(update - 1) % mbl_train_freq == 0, ob_val=val_dataset['ob'], ac_val=val_dataset['ac'], ob_next_val=val_dataset['ob_next']) # ----------------------------- if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) l_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train(lrnow) else: # recurrent version print("caole") assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv("AverageReturn", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('loss/' + lossname, lossval) if sil_update > 0: logger.logkv("sil_samples", sil_samples) if rank == 0: # MBL evaluation if not collect_val_data: #set_global_seeds(seed) default_sess = tf.get_default_session() def multithread_eval_policy(env_, pi_, num_episodes_, vis_eval_, seed): with default_sess.as_default(): if hasattr(env, 'ob_rms') and hasattr( env_, 'ob_rms'): env_.ob_rms = env.ob_rms res = eval_policy(env_, pi_, num_episodes_, vis_eval_, seed, measure_time, measure_rew) try: env_.close() except: pass return res if mbl.is_warm_start_done() and update % eval_freq == 0: warm_start_done = mbl.is_warm_start_done() if num_eval_episodes > 0: targs_names = {} with timed('eval'): num_descs = len(all_eval_descs) list_field_names = [ e[0] for e in all_eval_descs ] list_legend_names = [ e[1] for e in all_eval_descs ] list_pis = [e[2] for e in all_eval_descs] list_eval_envs = [ make_eval_env() for _ in range(num_descs) ] list_seed = [seed for _ in range(num_descs)] list_num_eval_episodes = [ num_eval_episodes for _ in range(num_descs) ] print(list_field_names) print(list_legend_names) list_vis_eval = [ vis_eval for _ in range(num_descs) ] for i in range(num_descs): field_name, legend_name = list_field_names[ i], list_legend_names[i], res = multithread_eval_policy( list_eval_envs[i], list_pis[i], list_num_eval_episodes[i], 
list_vis_eval[i], seed) #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed)) #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results): perf, elapsed_time, eval_rew = res logger.logkv(field_name, perf) if measure_time: logger.logkv('Time-%s' % (field_name), elapsed_time) if measure_rew: logger.logkv( 'SimRew-%s' % (field_name), eval_rew) targs_names[field_name] = legend_name if eval_val_err: fwd_dynamics_err = mbl.eval_forward_dynamic( obs=eval_val_dataset['ob'], acs=eval_val_dataset['ac'], obs_next=eval_val_dataset['ob_next']) logger.logkv('FwdValError', fwd_dynamics_err) #logger.dump_tabular() logger.dumpkvs() #print(logger.get_dir()) #print(targs_names) #if num_eval_episodes > 0: # win = plot(viz, win, logger.get_dir(), targs_names=targs_names, quant=quant, opt='best') #else: logger.dumpkvs() # ----------- yield pi if collect_val_data: with open(validation_set_path, 'wb') as f: pickle.dump(val_dataset_collect, f) logger.log('Save {} validation data'.format( len(val_dataset_collect))) if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) return model
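# Unlike the other variants in this file, this `learn` is a generator: it yields `pi` once per
# update inside the training loop. A hypothetical driver sketch showing how a caller would drain
# it (the helper name is illustrative):
def _example_drive_mbl_learn(**learn_kwargs):
    final_pi = None
    for pi in learn(**learn_kwargs):  # one policy snapshot per PPO update
        final_pi = pi
    return final_pi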
def learn(*, policy, env, nsteps, total_episodes, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, keep_all_ckpt=False): # FIXME(cpacker): # Callable lr and cliprange don't work (at the moment) with the # total_episodes terminating condition if isinstance(lr, float): lr = constfn(lr) else: raise NotImplementedError assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: raise NotImplementedError assert callable(cliprange) # total_timesteps = int(total_timesteps) total_episodes = int(total_episodes) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) tfirststart = time.time() # nupdates = total_timesteps//nbatch # for update in range(1, nupdates+1): update = 0 episodes_so_far = 0 old_savepath = None while True: update += 1 if episodes_so_far > total_episodes: break assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() # frac = 1.0 - (update - 1.0) / nupdates frac = 1.0 - (update - 1.0) / total_episodes lrnow = lr(frac) cliprangenow = cliprange(frac) obs, returns, masks, actions, values, neglogpacs, states, epinfos, num_episodes = runner.run( ) #pylint: disable=E0632 # NOTE(cpacker): Is this the best/correct way to keep track of n_eps? 
#episodes_so_far += len(epinfos) episodes_so_far += num_episodes epinfobuf.extend(epinfos) mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_episodes", episodes_so_far) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) obs_norms = {} obs_norms['clipob'] = env.clipob obs_norms['mean'] = env.ob_rms.mean obs_norms['var'] = env.ob_rms.var + env.epsilon with open(osp.join(checkdir, 'normalize'), 'wb') as f: pickle.dump(obs_norms, f, pickle.HIGHEST_PROTOCOL) model.save(savepath) if not keep_all_ckpt and old_savepath: print('Removing previous checkpoint', old_savepath) os.remove(old_savepath) old_savepath = savepath env.close()
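# The checkpoint block above pickles the observation-normalization statistics ('clipob', 'mean',
# 'var', with epsilon already folded into 'var') next to the weights. A hypothetical sketch of
# applying them to raw observations at evaluation time, mirroring the VecNormalize observation
# filter (the helper name and path argument are illustrative):
def _example_apply_obs_norms(ob, norms_path):
    import pickle
    import numpy as np

    with open(norms_path, 'rb') as f:
        norms = pickle.load(f)
    return np.clip((ob - norms['mean']) / np.sqrt(norms['var']),
                   -norms['clipob'], norms['clipob'])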
def log(self, rewards, dones):
    self.logs['ep_rew'] += rewards
    self.logs['dones'] = np.maximum(self.logs['dones'], dones)
    if sum(self.logs['dones']) < self.envs.num_envs:
        return

    self.logs['eps'] += self.envs.num_envs
    self.logs['rew_best'] = max(self.logs['rew_best'], np.mean(self.logs['ep_rew']))

    elapsed_time = time.time() - self.logs['start_time']
    frames = self.envs.num_envs * self.n_steps * self.logs['updates']

    logger.logkv('fps', int(frames / elapsed_time))
    logger.logkv('elapsed_time', int(elapsed_time))
    logger.logkv('n_eps', self.logs['eps'])
    logger.logkv('n_samples', frames)
    logger.logkv('n_updates', self.logs['updates'])
    logger.logkv('rew_best_mean', self.logs['rew_best'])
    logger.logkv('rew_max', np.max(self.logs['ep_rew']))
    logger.logkv('rew_mean', np.mean(self.logs['ep_rew']))
    logger.logkv('rew_mestd', np.std(self.logs['ep_rew']))  # weird name ensures it sorts above rew_min, since the logger sorts keys
    logger.logkv('rew_min', np.min(self.logs['ep_rew']))
    logger.dumpkvs()

    self.logs['dones'] = np.zeros(self.envs.num_envs)
    self.logs['ep_rew'] = np.zeros(self.envs.num_envs)
def learn(*, network, env, total_timesteps, nsteps=2048, lr=3e-4, vf_coef=0.5, gamma=0.99, lam=0.95, log_interval=1, save_interval=0, load_path=None, gradstepsperepoch=32, noptepochs=10, epsilon=0.4, replay_length=64, J_targ=0.001, epsilon_b=0.1, gaev = 1, eval_env = None, seed=None, **network_kwargs): ''' Dimension-Wise Importance Sampling Weight Clipping (DISC) parameters Parameters: ---------- network: multi-layer perceptrons (MLP) with 2 hidden layers of size 64 env: Mujoco environment eval_env: environment for the deterministic evaluation total_timesteps: int number of time steps nsteps (N): int size of a sample batch lr (beta): float function learning rate which reduces linearly as iterations goes on vf_coef: float value function loss coefficient gamma: float discounting factor lam (lambda) : float discounting factor for GAE log_interval: int number of time steps between logging events save_interval: int number of time steps between saving events load_path: str path to load the model from gradstepsperepoch: int number of training per epoch noptepochs: int number of training epochs per update epsilon : float clipping factor for dimension-wise clipping replay length (L) : int maximum number of sample batches stored in the replay buffer J_targ: float IS target constant epsilon_b : float batch inclusion factor gaev : int use GAE-V if gaev = 1, and use GAE otherwise ''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(epsilon, float): epsilon = constfn(epsilon) else: assert callable(epsilon) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space obdim = ob_space.shape[0] acdim = ac_space.shape[0] print("Observation space dimension : " + str(obdim)) print("Action space dimension : " + str(acdim)) # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // gradstepsperepoch # Instantiate the model object (that creates act_model and train_model) make_model = lambda : Model(policy=policy, nbatch_act=nenvs, nsteps=nsteps, vf_coef=vf_coef) model = make_model() if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = EvalRunner(env = eval_env, model = model, nsteps = 10*nsteps, gamma = gamma, lam= lam) eval_runner.obfilt=runner.obfilt eval_runner.rewfilt=runner.rewfilt epinfobuf = deque(maxlen=10) # Start total timer tfirststart = time.time() nupdates = total_timesteps//nbatch def GAE(seg, gamma, value, lam): """ Compute target value using TD(lambda) estimator, and advantage with GAE """ done = np.append(seg["done"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 T = len(seg["rew"]) gaelam = np.empty(T, 'float32') rew = runner.rewfilt(seg["rew"]) lastgaelam = 0 for t in reversed(range(T)): nonterminal = 1 - done[t + 1] delta = rew[t] + gamma * value[t + 1] * nonterminal - value[t] gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam ret = gaelam + value[:-1] return ret, gaelam def GAE_V(seg, gamma, value, rho): """ Compute target value using V-trace estimator, and advantage with GAE-V """ done = np.append(seg["done"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 rho_ = np.append(rho, 1.0) r = np.minimum(1.0, rho_) T 
= len(seg["rew"]) gaelam = np.empty(T, 'float32') rew = runner.rewfilt(seg["rew"]) lastgaelam = 0 for t in reversed(range(T)): nonterminal = 1 - done[t + 1] delta = (rew[t] + gamma * value[t + 1] * nonterminal - value[t]) gaelam[t] = delta + gamma * lam * nonterminal * lastgaelam lastgaelam = r[t] * gaelam[t] ret = r[:-1]*gaelam + value[:-1] return ret, gaelam seg = None # Calculate the epsilon epsilonnow = epsilon(1.0) alpha_IS=1.0 for update in range(1, nupdates+1): assert nbatch % gradstepsperepoch == 0 # Start timer tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = np.maximum(1e-4, lr(frac)) if seg is None: prev_seg = seg seg = {} else: prev_seg = {} for i in seg: prev_seg[i] = np.copy(seg[i]) # Run a sample batch seg["ob"], seg["rew"], seg["done"], seg["ac"], seg["neglogp"], seg["mean"], seg["logstd"], final_obs, final_done, epinfos = runner.run() #pylint: disable=E0632 epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfos = eval_runner.run() # Stack the sample batches (the maximum length is L) if prev_seg is not None: for key in seg: if len(np.shape(seg[key])) == 1: seg[key] = np.hstack([prev_seg[key], seg[key]]) else: seg[key] = np.vstack([prev_seg[key], seg[key]]) if np.shape(seg[key])[0] > replay_length * nsteps: seg[key] = seg[key][-replay_length * nsteps:] # Compute all values of all samples in the buffer ob_stack = np.vstack([seg["ob"], final_obs]) values = model.values(runner.obfilt(ob_stack)) values[-1] = (1.0-final_done) * values[-1] ob = runner.obfilt(seg["ob"]) # Compute IS weight of all samples in the buffer mean_now, logstd_now = model.meanlogstds(ob) neglogpnow = 0.5 * np.sum(np.square((seg["ac"] - mean_now) / np.exp(logstd_now)), axis=-1) \ + 0.5 * np.log(2.0 * np.pi) * np.shape(seg["ac"])[1] \ + np.sum(logstd_now, axis=-1) rho = np.exp(-neglogpnow + seg["neglogp"]) # Estimate target values and advantages if gaev==1: ret, gae = GAE_V(seg, gamma, values, rho) else: ret, gae = GAE(seg, gamma, values, lam) # Select sample batches which satisfies batch limiting condition in the paper prior_prob = np.zeros(len(seg["ob"])) rho_dim = np.exp(- 0.5 * np.square((seg["ac"] - mean_now) / np.exp(logstd_now)) \ - logstd_now + 0.5 * np.square((seg["ac"] - seg["mean"]) / np.exp(seg["logstd"])) + seg["logstd"]) for i in range(int(len(prior_prob) / nsteps)): batch_condition = np.mean(np.abs(rho_dim[i * nsteps:(i + 1) * nsteps] - 1.0) + 1.0) if batch_condition > 1 + epsilon_b: prior_prob[i * nsteps:(i + 1) * nsteps] = 0 else: prior_prob[i * nsteps:(i + 1) * nsteps] = 1 # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] # Index of each element of batch_size # Create the indices array # On-policy data indices and minibatch size inds_on = np.arange(nsteps)+len(seg["ob"]) - nsteps nbatch_adapt_on = int((nsteps) / nsteps * nbatch_train) # Off-policy data indices and minibatch size inds_off = np.arange(len(seg["ob"]) - nsteps) nbatch_adapt_off = int((np.sum(prior_prob) - nsteps) / nsteps * nbatch_train) # On-policy data index on_policy_data = np.ones(len(seg["ob"])) * np.sum(prior_prob) / nsteps on_policy_data[:-nsteps]=0 for _ in range(noptepochs): losses_epoch = [] for _ in range(int(nsteps/nbatch_train)): # Choose sample minibatch indices of off policy trajectories if nbatch_adapt_off>0: idx_off = np.random.choice(inds_off, nbatch_adapt_off,p=prior_prob[:-nsteps]/np.sum(prior_prob[:-nsteps])) else: idx_off = [] # Choose sample minibatch indices of on policy trajectories idx_on = np.random.choice(inds_on, nbatch_adapt_on) all_idx = np.hstack([idx_off,idx_on]).astype(int) # Sample minibatch slices = (arr[all_idx] for arr in (ob, ret, gae, seg["ac"], values[:-1], seg["neglogp"], seg["mean"], seg["logstd"], on_policy_data, rho)) # Train the model loss_epoch = model.train(lrnow, epsilonnow, alpha_IS, *slices) mblossvals.append(loss_epoch) losses_epoch.append(loss_epoch) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # Update adaptive IS target constant print("IS loss avg :", lossvals[3]) if lossvals[3] > J_targ * 1.5: alpha_IS *= 2 print("Adaptive IS loss factor is increased") elif lossvals[3] < J_targ / 1.5: alpha_IS /= 2 print("Adaptive IS loss factor is reduced") alpha_IS = np.clip(alpha_IS,2**(-10),64) # End timer tnow = time.time() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: logger.logkv("adaptive IS loss factor", alpha_IS) logger.logkv("clipping factor", epsilonnow) logger.logkv("learning rate", lrnow) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfos])) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfos])) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) model.save(savepath) return model
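# --- Illustrative sketch (not part of the DISC code above) ---------------------------------
# A minimal, self-contained numpy version of the GAE and GAE-V estimators defined inside
# learn() above, run on made-up toy data. The reward normalization (runner.rewfilt) is
# omitted and all inputs are invented; only the backward recursions mirror the code.
import numpy as np

def gae_sketch(rew, value, done, gamma=0.99, lam=0.95):
    # delta_t = r_t + gamma * V_{t+1} * (1 - done_{t+1}) - V_t, accumulated with gamma * lam.
    T = len(rew)
    done = np.append(done, 0)
    adv = np.empty(T, "float32")
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - done[t + 1]
        delta = rew[t] + gamma * value[t + 1] * nonterminal - value[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    return adv + value[:-1], adv

def gae_v_sketch(rew, value, done, rho, gamma=0.99, lam=0.95):
    # Same deltas as GAE, but the carried term is damped by the truncated IS weight
    # min(1, rho_t), so stale off-policy samples contribute less to the advantage,
    # and the target value is the V-trace-style return r * adv + V.
    T = len(rew)
    done = np.append(done, 0)
    r = np.minimum(1.0, np.append(rho, 1.0))
    adv = np.empty(T, "float32")
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - done[t + 1]
        delta = rew[t] + gamma * value[t + 1] * nonterminal - value[t]
        adv[t] = delta + gamma * lam * nonterminal * lastgaelam
        lastgaelam = r[t] * adv[t]
    return r[:-1] * adv + value[:-1], adv

if __name__ == "__main__":
    rew = np.array([1.0, 0.0, 1.0, 0.0], dtype="float32")
    value = np.array([0.5, 0.4, 0.6, 0.3, 0.2], dtype="float32")  # T + 1 values
    done = np.array([0.0, 0.0, 0.0, 1.0], dtype="float32")
    rho = np.array([1.2, 0.8, 1.0, 0.5], dtype="float32")         # per-step IS weights
    print(gae_sketch(rew, value, done))
    print(gae_v_sketch(rew, value, done, rho))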
def set_steps_to_remain(self, steps_to_remain):
    if self.args.scheduler_type == "global":
        self.steps_to_remain = steps_to_remain
        self.last_timesteps_so_far = self.locals["timesteps_so_far"]
        logger.logkv("steps_to_remain", steps_to_remain)
        print("GLOBAL curriculum: ", steps_to_remain)
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.01, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=1, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, save_model_path=None, model_fn=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space if isinstance(network, str): network_type = network policy_network_fn = get_network_builder(network_type)(**network_kwargs) network = policy_network_fn(ob_space.shape) # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(ac_space=ac_space, policy_network=network, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if load_path is not None: load_path = osp.expanduser(load_path) ckpt = tf.train.Checkpoint(model=model) manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None) ckpt.restore(manager.latest_checkpoint) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) # epinfobuf = deque(maxlen=100) epinfobuf_rewards = deque(maxlen=100) epinfobuf_len = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): # for update in range(0, nupdates+1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) cliprangenow = cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 # epinfobuf.extend(epinfos) epinfobuf_rewards.extend(epinfos['r']) epinfobuf_len.extend([len(epinfos['l'])]) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (tf.constant(arr[mbinds]) for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version raise ValueError('Not supported yet') # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predictor of the returns (ev > 1) # or if it's just worse than predicting nothing (ev <= 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo for epinfo in epinfobuf_rewards])) logger.logkv('eplenmean', safemean([epinfo for epinfo in epinfobuf_len])) # logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) # logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) # logger.logkv('eprewmean', safemean(returns)) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('loss/' + lossname, lossval) logger.dumpkvs() if save_model_path is not None: filepath = os.path.join(save_model_path, 'models') model.train_model.value_network.save_weights( '{}/world_model_value_net_{}.h5'.format(filepath, seed)) model.train_model.policy_network.save_weights( '{}/world_model_policy_net_{}.h5'.format(filepath, seed)) return model
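# --- Illustrative sketch --------------------------------------------------------------------
# The learn() functions in this file share one convention for lr / cliprange: the argument is
# either a float (wrapped into a constant function) or a callable on frac in [0, 1], where
# frac starts at 1.0 on the first update and decays towards 0.0 on the last. The constfn below
# is a local re-definition for this sketch; the real helper lives elsewhere in the codebase.
def constfn(val):
    def f(_):
        return val
    return f

def linear_schedule(initial):
    # Anneals linearly from `initial` (frac = 1) to 0 (frac = 0).
    return lambda frac: initial * frac

def as_schedule(value):
    return constfn(value) if isinstance(value, float) else value

if __name__ == "__main__":
    lr = as_schedule(linear_schedule(3e-4))   # schedule function
    cliprange = as_schedule(0.2)              # plain float, wrapped
    nupdates = 5
    for update in range(1, nupdates + 1):
        frac = 1.0 - (update - 1.0) / nupdates
        print(update, lr(frac), cliprange(frac))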
def test_one_env(alt_flag, model, start_level, num_levels, logger, args, env=None): ## Modified based on random_ppo.learn if not env: venv = ProcgenEnv(num_envs=num_envs, env_name=args.env_name, num_levels=num_levels, start_level=start_level, distribution_mode=args.distribution_mode) venv = VecExtractDictObs(venv, "rgb") venv = VecMonitor( venv=venv, filename=None, keep_buf=100, ) venv = VecNormalize(venv=venv, ob=False) env = venv runner = TestRunner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf10 = deque(maxlen=10) epinfobuf100 = deque(maxlen=100) mean_rewards = [] datapoints = [] for rollout in range(1, args.nrollouts + 1): logger.info('collecting rollouts {}...'.format(rollout)) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( alt_flag) epinfobuf10.extend(epinfos) epinfobuf100.extend(epinfos) rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10]) rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100]) ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10]) ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100]) logger.info('\n----', rollout) mean_rewards.append(rew_mean_10) logger.logkv('start_level', start_level) logger.logkv('eprew10', rew_mean_10) logger.logkv('eprew100', rew_mean_100) logger.logkv('eplenmean10', ep_len_mean_10) logger.logkv('eplenmean100', ep_len_mean_100) logger.logkv("misc/total_timesteps", rollout * args.nbatch) logger.info('----\n') logger.dumpkvs() env.close() logger.info("Average reward on levels {} ~ {}: {} ".format( start_level, start_level + num_levels, mean_rewards)) return np.mean(mean_rewards)
def learn(*, network, env, total_timesteps, eval_env = None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps//nbatch for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632 epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) ) logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) ) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) model.save(savepath) return model
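# --- Illustrative sketch --------------------------------------------------------------------
# A toy demonstration (numpy only, made-up sizes) of the two minibatch index schemes used in
# the training loop above: the non-recurrent branch shuffles flat sample indices, while the
# recurrent branch shuffles whole environments so each minibatch keeps contiguous nsteps-long
# sequences per environment, which keeps the recurrent states aligned with their samples.
import numpy as np

nenvs, nsteps, nminibatches = 4, 6, 2
nbatch = nenvs * nsteps
nbatch_train = nbatch // nminibatches

# Non-recurrent: any permutation of the flat batch works.
inds = np.arange(nbatch)
np.random.shuffle(inds)
flat_minibatches = [inds[s:s + nbatch_train] for s in range(0, nbatch, nbatch_train)]

# Recurrent: pick envsperbatch environments per minibatch and take all their timesteps.
envsperbatch = nenvs // nminibatches
envinds = np.arange(nenvs)
flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
np.random.shuffle(envinds)
recurrent_minibatches = [flatinds[envinds[s:s + envsperbatch]].ravel()
                         for s in range(0, nenvs, envsperbatch)]

print(flat_minibatches)
print(recurrent_minibatches)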
def ppo(env, policy, val_fn=None, total_steps=TOTAL_STEPS_DEFAULT, steps=125, n_envs=16, gamma=0.99, gaelam=0.96, clip_ratio=0.2, pol_iters=80, val_iters=80, pol_lr=3e-4, val_lr=1e-3, target_kl=0.01, mb_size=100, **saver_kwargs): val_fn = val_fn or ValueFunction.from_policy(policy) logu.save_config(locals()) saver = SnapshotSaver(logger.get_dir(), locals(), **saver_kwargs) vec_env = VecEnvMaker(env)(n_envs) policy = policy.pop("class")(vec_env, **policy) val_fn = val_fn.pop("class")(vec_env, **val_fn) pol_optim = torch.optim.Adam(policy.parameters(), lr=pol_lr) val_optim = torch.optim.Adam(val_fn.parameters(), lr=val_lr) loss_fn = torch.nn.MSELoss() # Algorithm main loop collector = parallel_samples_collector(vec_env, policy, steps) beg, end, stp = steps * n_envs, total_steps + steps * n_envs, steps * n_envs for samples in trange(beg, end, stp, desc="Training", unit="step"): logger.info("Starting iteration {}".format(samples // stp)) logger.logkv("Iteration", samples // stp) logger.info("Start collecting samples") trajs = next(collector) logger.info("Computing policy gradient variables") compute_pg_vars(trajs, val_fn, gamma, gaelam) flatten_trajs(trajs) all_obs, all_acts, _, _, all_advs, all_vals, all_rets = trajs.values() all_obs, all_vals = all_obs[:-n_envs], all_vals[:-n_envs] logger.info("Minimizing surrogate loss") with torch.no_grad(): old_dists = policy(all_obs) old_logp = old_dists.log_prob(all_acts) min_advs = torch.where(all_advs > 0, (1 + clip_ratio) * all_advs, (1 - clip_ratio) * all_advs) dataset = TensorDataset(all_obs, all_acts, all_advs, min_advs, old_logp) dataloader = DataLoader(dataset, batch_size=mb_size, shuffle=True) for itr in range(pol_iters): for obs, acts, advs, min_adv, logp in dataloader: ratios = (policy(obs).log_prob(acts) - logp).exp() pol_optim.zero_grad() (-torch.min(ratios * advs, min_adv)).mean().backward() pol_optim.step() with torch.no_grad(): mean_kl = kl(old_dists, policy(all_obs)).mean().item() if mean_kl > 1.5 * target_kl: logger.info( "Stopped at step {} due to reaching max kl".format(itr + 1)) break logger.logkv("StopIter", itr + 1) logger.info("Updating val_fn") for _ in range(val_iters): val_optim.zero_grad() loss_fn(val_fn(all_obs), all_rets).backward() val_optim.step() logger.info("Logging information") logger.logkv("TotalNSamples", samples) logu.log_reward_statistics(vec_env) logu.log_val_fn_statistics(all_vals, all_rets) logu.log_action_distribution_statistics(old_dists) logger.logkv("MeanKL", mean_kl) logger.dumpkvs() logger.info("Saving snapshot") saver.save_state( index=samples // stp, state=dict( alg=dict(last_iter=samples // stp), policy=policy.state_dict(), val_fn=val_fn.state_dict(), pol_optim=pol_optim.state_dict(), val_optim=val_optim.state_dict(), ), ) vec_env.close()
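# --- Illustrative sketch --------------------------------------------------------------------
# A minimal PyTorch version of the surrogate loss as written in ppo() above: the clipped
# branch is precomputed once as min_advs = (1 +/- clip_ratio) * advs, which gives the same
# value as the more common form that clamps the ratio to [1 - clip_ratio, 1 + clip_ratio].
# Inputs in the demo are random and purely illustrative.
import torch

def clipped_surrogate_loss(new_logp, old_logp, advs, clip_ratio=0.2):
    ratios = (new_logp - old_logp).exp()
    min_advs = torch.where(advs > 0, (1 + clip_ratio) * advs, (1 - clip_ratio) * advs)
    return -(torch.min(ratios * advs, min_advs)).mean()

if __name__ == "__main__":
    torch.manual_seed(0)
    old_logp = torch.randn(8)
    new_logp = old_logp + 0.1 * torch.randn(8)
    advs = torch.randn(8)
    print(clipped_surrogate_loss(new_logp, old_logp, advs))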
def a2c_kfac(env, policy, val_fn=None, total_steps=TOTAL_STEPS_DEFAULT, steps=20, n_envs=16, kfac=None, ent_coeff=0.01, vf_loss_coeff=0.5, gamma=0.99, log_interval=100, warm_start=None, **saver_kwargs): assert val_fn is None or not issubclass( policy["class"], WeightSharingAC ), "Choose between a weight sharing model or separate policy and val_fn" # handle default values kfac = kfac or {} kfac = { "eps": 1e-3, "pi": True, "alpha": 0.95, "kl_clip": 1e-3, "eta": 1.0, **kfac } if val_fn is None and not issubclass(policy["class"], WeightSharingAC): val_fn = ValueFunction.from_policy(policy) # save config and setup state saving logu.save_config(locals()) saver = SnapshotSaver(logger.get_dir(), locals(), **saver_kwargs) # initialize models and optimizer vec_env = VecEnvMaker(env)(n_envs) policy = policy.pop("class")(vec_env, **policy) module_list = torch.nn.ModuleList(policy.modules()) if val_fn is not None: val_fn = val_fn.pop("class")(vec_env, **val_fn) module_list.extend(val_fn.modules()) optimizer = KFACOptimizer(module_list, **kfac) # scheduler = LinearLR(optimizer, total_steps // (steps*n_envs)) loss_fn = torch.nn.MSELoss() # load state if provided updates = 0 if warm_start is not None: if ":" in warm_start: warm_start, index = warm_start.split(":") else: index = None config, state = SnapshotSaver(warm_start, latest_only=False).get_state(int(index)) policy.load_state_dict(state["policy"]) if "optimizer" in state: optimizer.load_state_dict(state["optimizer"]) updates = state["alg"]["last_updt"] # Algorith main loop if val_fn is None: compute_dists_vals = policy else: def compute_dists_vals(obs): return policy(obs), val_fn(obs) ob_space, ac_space = vec_env.observation_space, vec_env.action_space obs = torch.from_numpy(vec_env.reset()) with torch.no_grad(): acts = policy.actions(obs) logger.info("Starting epoch {}".format(1)) beg, end, stp = steps * n_envs, total_steps + steps * n_envs, steps * n_envs total_updates = total_steps // stp for samples in trange(beg, end, stp, desc="Training", unit="step"): all_obs = torch.empty((steps, n_envs) + ob_space.shape, dtype=_NP_TO_PT[ob_space.dtype.type]) all_acts = torch.empty((steps, n_envs) + ac_space.shape, dtype=_NP_TO_PT[ac_space.dtype.type]) all_rews = torch.empty((steps, n_envs)) all_dones = torch.empty((steps, n_envs)) with torch.no_grad(): for i in range(steps): next_obs, rews, dones, _ = vec_env.step(acts.numpy()) all_obs[i] = obs all_acts[i] = acts all_rews[i] = torch.from_numpy(rews) all_dones[i] = torch.from_numpy(dones.astype("f")) obs = torch.from_numpy(next_obs) acts = policy.actions(obs) all_obs = all_obs.reshape(stp, -1).squeeze() all_acts = all_acts.reshape(stp, -1).squeeze() # Sample Fisher curvature matrix with optimizer.record_stats(): optimizer.zero_grad() all_dists, all_vals = compute_dists_vals(all_obs) logp = all_dists.log_prob(all_acts) noise = all_vals.detach() + 0.5 * torch.randn_like(all_vals) (logp.mean() + loss_fn(all_vals, noise)).backward(retain_graph=True) # Compute returns and advantages with torch.no_grad(): _, next_vals = compute_dists_vals(obs) all_rets = all_rews.clone() all_rets[-1] += gamma * (1 - all_dones[-1]) * next_vals for i in reversed(range(steps - 1)): all_rets[i] += gamma * (1 - all_dones[i]) * all_rets[i + 1] all_rets = all_rets.flatten() all_advs = all_rets - all_vals.detach() # Compute loss updates += 1 # ent_coeff = ent_coeff*0.99 if updates % 10 == 0 else ent_coeff pi_loss = -torch.mean(logp * all_advs) vf_loss = loss_fn(all_vals, all_rets) entropy = all_dists.entropy().mean() total_loss = pi_loss 
- ent_coeff * entropy + vf_loss_coeff * vf_loss # scheduler.step() optimizer.zero_grad() total_loss.backward() optimizer.step() if updates == 1 or updates % log_interval == 0: logger.logkv("Epoch", updates // log_interval + 1) logger.logkv("TotalNSamples", samples) logu.log_reward_statistics(vec_env) logu.log_val_fn_statistics(all_vals, all_rets) logu.log_action_distribution_statistics(all_dists) logger.dumpkvs() logger.info("Starting epoch {}".format(updates // log_interval + 2)) saver.save_state( index=updates, state=dict( alg=dict(last_updt=updates), policy=policy.state_dict(), val_fn=None if val_fn is None else val_fn.state_dict(), optimizer=optimizer.state_dict(), ), ) vec_env.close()
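# --- Illustrative sketch --------------------------------------------------------------------
# A standalone toy version of the return computation inside a2c_kfac() above: bootstrap the
# last step from the value of the final observation, then accumulate discounted rewards
# backwards, cutting the bootstrap across episode boundaries. The dones indexing mirrors the
# code above; shapes and numbers here are made up.
import torch

def nstep_returns(rews, dones, next_vals, gamma=0.99):
    # rews, dones: (steps, n_envs); next_vals: (n_envs,)
    rets = rews.clone()
    rets[-1] += gamma * (1 - dones[-1]) * next_vals
    for i in reversed(range(rews.shape[0] - 1)):
        rets[i] += gamma * (1 - dones[i]) * rets[i + 1]
    return rets

if __name__ == "__main__":
    rews = torch.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
    dones = torch.tensor([[0.0, 0.0], [0.0, 1.0], [0.0, 0.0]])
    next_vals = torch.tensor([0.5, 0.5])
    print(nstep_returns(rews, dones, next_vals))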
def learn(*, network, env, total_timesteps, early_stopping=False, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, scope='', **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' additional_params = network_kwargs["network_kwargs"] from baselines import logger # set_global_seeds(seed) We deal with seeds upstream if "LR_ANNEALING" in additional_params.keys(): lr_reduction_factor = additional_params["LR_ANNEALING"] start_lr = lr lr = lambda prop: (start_lr / lr_reduction_factor) + ( start_lr - (start_lr / lr_reduction_factor )) * prop # Anneals linearly from lr to lr/red factor if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) bestrew = 0 # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, scope=scope) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) if eval_env is not None: eval_epinfobuf = deque(maxlen=100) # Start total timer tfirststart = time.perf_counter() best_rew_per_step = 0 run_info = defaultdict(list) nupdates = total_timesteps // nbatch print("TOT NUM UPDATES", nupdates) for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0, "Have {} total batch size and want {} minibatches, can't split evenly".format( nbatch, nminibatches) # Start timer tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 eplenmean = safemean([epinfo['l'] for epinfo in epinfos]) eprewmean = safemean([epinfo['r'] for epinfo in epinfos]) rew_per_step = eprewmean / eplenmean print("Curr learning rate {} \t Curr reward per step {}".format( lrnow, rew_per_step)) if rew_per_step > best_rew_per_step and early_stopping: # Avoid updating best model at first iteration because the means might be a bit off because # of how the multithreaded batch simulation works best_rew_per_step = eprewmean / eplenmean checkdir = osp.join(logger.get_dir(), 'checkpoints') model.save(checkdir + ".temp_best_model") print("Saved model as best", best_rew_per_step, "avg rew/step") epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in tqdm.trange(0, nbatch, nbatch_train, desc="{}/{}".format(_, noptepochs)): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) eprewmean = safemean([epinfo['r'] for epinfo in epinfobuf]) ep_dense_rew_mean = safemean( [epinfo['dense_r'] for epinfo in epinfobuf]) ep_sparse_rew_mean = safemean( [epinfo['sparse_r'] for epinfo in epinfobuf]) eplenmean = safemean([epinfo['l'] for epinfo in epinfobuf]) run_info['eprewmean'].append(eprewmean) run_info['ep_dense_rew_mean'].append(ep_dense_rew_mean) run_info['ep_sparse_rew_mean'].append(ep_sparse_rew_mean) run_info['eplenmean'].append(eplenmean) run_info['explained_variance'].append(float(ev)) logger.logkv( 'true_eprew', safemean([epinfo['sparse_r'] for epinfo in epinfobuf])) logger.logkv('eprewmean', eprewmean) logger.logkv('eplenmean', eplenmean) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf])) time_elapsed = tnow - tfirststart logger.logkv('time_elapsed', time_elapsed) time_per_update = time_elapsed / update time_remaining = (nupdates - update) * time_per_update logger.logkv('time_remaining', time_remaining / 60) for (lossval, lossname) in zip(lossvals, model.loss_names): run_info[lossname].append(lossval) logger.logkv(lossname, lossval) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() # Update current logs if additional_params["RUN_TYPE"] in ["ppo", "joint_ppo"]: from hr_coordination.utils import save_dict_to_file save_dict_to_file(run_info, additional_params["SAVE_DIR"] + "logs") # Linear annealing of reward shaping if additional_params["REW_SHAPING_HORIZON"] != 0: # Piecewise linear annealing schedule # annealing_thresh: until when we should stop doing 100% reward shaping # annealing_horizon: when we should reach doing 0% reward shaping annealing_horizon = additional_params[ "REW_SHAPING_HORIZON"] 
annealing_thresh = 0 def fn(x): if annealing_thresh != 0 and annealing_thresh - ( annealing_horizon / annealing_thresh) * x > 1: return 1 else: fn = lambda x: -1 * (x - annealing_thresh) * 1 / ( annealing_horizon - annealing_thresh) + 1 return max(fn(x), 0) curr_timestep = update * nbatch curr_reward_shaping = fn(curr_timestep) env.update_reward_shaping_param(curr_reward_shaping) print("Current reward shaping", curr_reward_shaping) # Save/overwrite best model if past a certain threshold if ep_sparse_rew_mean > bestrew and ep_sparse_rew_mean > additional_params[ "SAVE_BEST_THRESH"]: # Don't save best model if still doing some self play and it's supposed to be a BC model if additional_params[ "OTHER_AGENT_TYPE"][:2] == "bc" and additional_params[ "SELF_PLAY_RND_GOAL"] != 0 and env.self_play_randomization > 0: pass from hr_coordination.ppo.ppo import save_ppo_model print("BEST REW", ep_sparse_rew_mean, "overwriting previous model with", bestrew) save_ppo_model( model, "{}seed{}/best".format(additional_params["SAVE_DIR"], additional_params["CURR_SEED"])) bestrew = max(ep_sparse_rew_mean, bestrew) if additional_params["SELF_PLAY_RND_GOAL"] != 0: if type(additional_params["SELF_PLAY_RND_GOAL"] ) is not list: # Sigmoid self-play schedule based on current performance (not recommended) curr_reward = ep_sparse_rew_mean rew_target = additional_params["SELF_PLAY_RND_GOAL"] shift = rew_target / 2 t = (1 / rew_target) * 10 fn = lambda x: -1 * (np.exp(t * (x - shift)) / (1 + np.exp(t * (x - shift)))) + 1 env.self_play_randomization = fn(curr_reward) print("Current self-play randomization", env.self_play_randomization) else: # Piecewise linear self-play schedule # self_play_thresh: when we should stop doing 100% self-play # self_play_timeline: when we should reach doing 0% self-play self_play_thresh, self_play_timeline = additional_params[ "SELF_PLAY_RND_GOAL"] def fn(x): if self_play_thresh != 0 and self_play_timeline - ( self_play_timeline / self_play_thresh) * x > 1: return 1 else: fn = lambda x: -1 * ( x - self_play_thresh) * 1 / ( self_play_timeline - self_play_thresh ) + 1 return max(fn(x), 0) curr_timestep = update * nbatch env.self_play_randomization = fn(curr_timestep) print("Current self-play randomization", env.self_play_randomization) if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and ( MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) # Visualization of rollouts with actual other agent run_type = additional_params["RUN_TYPE"] if run_type in ["ppo", "joint_ppo" ] and update % additional_params["VIZ_FREQUENCY"] == 0: from hr_coordination.agents.agent import AgentPair from hr_coordination.agents.benchmarking import AgentEvaluator from hr_coordination.pbt.pbt_utils import setup_mdp_env, get_agent_from_model print(additional_params["SAVE_DIR"]) overcooked_env = setup_mdp_env(display=False, **additional_params) agent = get_agent_from_model( model, additional_params["SIM_THREADS"], is_joint_action=(run_type == "joint_ppo")) agent.set_mdp(overcooked_env.mdp) if run_type == "ppo": if additional_params["OTHER_AGENT_TYPE"] == 'sp': agent_pair = AgentPair(agent, agent) else: print("PPO agent on index 0:") env.other_agent.set_mdp(overcooked_env.mdp) agent_pair = AgentPair(agent, env.other_agent) trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents( 
agent_pair, display=True, displayUntil=100) print("tot rew", tot_rewards, "tot rew shaped", tot_shaped_rewards) print("PPO agent on index 1:") agent_pair = AgentPair(env.other_agent, agent) else: agent_pair = AgentPair(agent) trajectory, time_taken, tot_rewards, tot_shaped_rewards = overcooked_env.run_agents( agent_pair, display=True, displayUntil=100) print("tot rew", tot_rewards, "tot rew shaped", tot_shaped_rewards) print(additional_params["SAVE_DIR"]) if nupdates > 0 and early_stopping: checkdir = osp.join(logger.get_dir(), 'checkpoints') print("Loaded best model", best_rew_per_step) model.load(checkdir + ".temp_best_model") return model, run_info
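# --- Illustrative sketch --------------------------------------------------------------------
# A simplified reading of the piecewise-linear schedules used in learn() above for reward
# shaping (REW_SHAPING_HORIZON) and self-play randomization: hold the coefficient at 1.0 until
# a threshold timestep, then anneal linearly to 0.0 by the horizon. The fn() helpers in the
# code fold the hold-at-1.0 case into one expression; thresholds and horizons here are
# illustrative, not the repo's defaults.
def piecewise_linear_anneal(timestep, threshold, horizon):
    if timestep <= threshold:
        return 1.0
    if timestep >= horizon:
        return 0.0
    return 1.0 - (timestep - threshold) / (horizon - threshold)

if __name__ == "__main__":
    for t in (0, 1_000_000, 2_500_000, 4_000_000, 6_000_000):
        print(t, piecewise_linear_anneal(t, threshold=1_000_000, horizon=5_000_000))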
def learn(*, policy, env, raw_env, use_2D_env=True, use_other_room=False, use_rich_reward=False, use_multiple_starts=False, use_feedback=True, use_real_feedback=False, only_use_hr_until=1000, trans_to_rl_in=1000, nsteps=8, total_timesteps=1000, ppo_lr=2e-4, cliprange=0.2, ent_coef=.1, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, ppo_noptepochs=4, ppo_batch_size=32, ppo_minibatch_size=8, init_rl_importance=0.2, feedback_lr=1e-3, min_feedback_buffer_size=32, feedback_noptepochs=4, feedback_batch_size=16, feedback_minibatch_size=8, feedback_training_prop=0.7, feedback_training_new_prop=0.4, feedback_use_mixup=False, hf_loss_type="CCE", hf_loss_param=None, good_feedback_acc=0.7, bad_feedback_acc=0.7, log_interval=10, save_interval=0, reload_name=None, base_path=None): if isinstance(ppo_lr, float): ppo_lr = constfn(ppo_lr) else: assert callable(ppo_lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) assert ppo_batch_size % nsteps == 0 ob_space = env.observation_space ac_space = env.action_space nenvs = 1 nbatch = nenvs * nsteps if hf_loss_type == 0: hf_loss_param = None make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=ppo_minibatch_size, nbatch_feedback=feedback_minibatch_size, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, hf_loss_type=hf_loss_type, hf_loss_param=hf_loss_param) if save_interval and logger.get_dir(): import cloudpickle if not base_path: base_path = os.path.dirname(os.path.abspath(__file__)) if not os.path.isdir(osp.join(base_path, "models")): os.mkdir(osp.join(base_path, "models")) with open(osp.join(base_path, "models", 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) if use_real_feedback: print("looking for an EEG_Pred stream...", end="", flush=True) feedback_LSL_stream = pylsl.StreamInlet(pylsl.resolve_stream('type', 'EEG_Pred')[0]) print(" done") model = make_model() if reload_name: model.load(reload_name) target_position = raw_env.robot.get_target_position() if use_2D_env: judge_action, *_ = run_dijkstra(raw_env, target_position, use_other_room=use_other_room) else: judge_action = judge_action_1D(raw_env, target_position) runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, judge_action=judge_action, use_rich_reward=use_rich_reward, use_multiple_starts=use_multiple_starts, use_feedback=use_feedback, use_real_feedback=use_real_feedback, only_use_hr_until=only_use_hr_until, trans_to_rl_in=trans_to_rl_in, init_rl_importance=init_rl_importance) epinfobuf = deque(maxlen=100) nupdates = total_timesteps // nbatch state_action_buffer = deque(maxlen=100) action_idx_buffer = deque(maxlen=100) feedback_buffer_train = {} feedback_buffer_train_true = {} feedback_buffer_valid = {} feedback_bmms = {} for a in range(ac_space.n): feedback_buffer_train[a], feedback_buffer_train_true[a], feedback_buffer_valid[a] = [], [], [] feedback_bmms[a] = 0 performance = {"feedback": [], "sparse_reward": [], "rich_reward": [], "train_acc": [], "train_true_acc": [], "valid_acc": []} epi_test_num = [0 for _ in range(ac_space.n)] ppo_obs, ppo_rewards, ppo_masks, ppo_actions, ppo_values, ppo_neglogpacs = [], [], [], [], [], [] for update in range(1, nupdates + 1): tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates ppo_lrnow = ppo_lr(frac) cliprangenow = cliprange(frac) obs, rewards, masks, actions, values, neglogpacs, cors, sparse_rew, rich_rew, _, action_idxs, 
epinfos = runner.run() epinfobuf.extend(epinfos) performance["sparse_reward"].extend(sparse_rew) performance["rich_reward"].extend(rich_rew) mblossvals = [] state_action_buffer.extend([[s, a] for s, a in zip(obs, actions)]) action_idx_buffer.extend(action_idxs) if use_feedback: if use_real_feedback: action_idxs, feedbacks, correct_feedbacks = get_feedback_from_LSL(feedback_LSL_stream) print("Received feedback from LSL", feedbacks) else: action_idxs, feedbacks, correct_feedbacks = \ get_simulated_feedback(cors if use_2D_env else obs, actions, action_idxs, judge_action, good_feedback_acc, bad_feedback_acc) performance["feedback"].extend(correct_feedbacks) # add feedbacks into feedback replay buffer if len(feedbacks): for a_idx, fb, cfb in zip(action_idxs, feedbacks, correct_feedbacks): s, a = state_action_buffer[action_idx_buffer.index(a_idx)] epi_test_num[a] += 1 - feedback_training_prop # s, fb, cfb = np.ones(13), 1, 1 if epi_test_num[a] > 1: feedback_buffer_valid[a].append([s, cfb]) epi_test_num[a] -= 1 else: feedback_buffer_train[a].append([s, fb]) feedback_buffer_train_true[a].append([s, cfb]) # train PPO if runner.num_step >= only_use_hr_until: ppo_obs.extend(obs) ppo_rewards.extend(rewards) ppo_masks.extend(masks) ppo_actions.extend(actions) ppo_values.extend(values) ppo_neglogpacs.extend(neglogpacs) if len(ppo_obs) == ppo_batch_size: ppo_obs = np.asarray(ppo_obs) ppo_rewards = np.asarray(ppo_rewards) ppo_masks = np.asarray(ppo_masks) ppo_actions = np.asarray(ppo_actions) ppo_values = np.asarray(ppo_values) ppo_neglogpacs = np.asarray(ppo_neglogpacs) ppo_returns = runner.calculate_returns(ppo_rewards, ppo_masks, ppo_values) inds = np.arange(ppo_batch_size) for _ in range(ppo_noptepochs): np.random.shuffle(inds) for start in range(0, ppo_batch_size, ppo_minibatch_size): end = start + ppo_minibatch_size mbinds = inds[start:end] slices = (arr[mbinds] for arr in (ppo_obs, ppo_returns, ppo_masks, ppo_actions, ppo_values, ppo_neglogpacs)) mblossvals.append(model.train(ppo_lrnow, cliprangenow, *slices)) ppo_obs, ppo_rewards, ppo_masks, ppo_actions, ppo_values, ppo_neglogpacs = [], [], [], [], [], [] # train feedback regressor if use_feedback and runner.num_step <= only_use_hr_until: all_train_acc = [] all_train_true_acc = [] all_valid_acc = [] if not all([len(feedback_buffer) >= min_feedback_buffer_size for feedback_buffer in feedback_buffer_train.values()]): performance["train_acc"].append(0.) performance["train_true_acc"].append(0.) performance["valid_acc"].append(0.) 
continue for a in range(ac_space.n): feedback_buffer = feedback_buffer_train[a] feedback_buffer_t = feedback_buffer_train_true[a] feedback_buffer_v = feedback_buffer_valid[a] bmm_model = feedback_bmms[a] for i in range(feedback_noptepochs): # print(len(feedback_buffer)) # print(feedback_buffer[:3]) if i < feedback_noptepochs * feedback_training_new_prop: inds = np.arange(len(feedback_buffer) - feedback_batch_size, len(feedback_buffer)) else: inds = np.random.choice(len(feedback_buffer), feedback_batch_size, replace=False) np.random.shuffle(inds) for start in range(0, feedback_batch_size, feedback_minibatch_size): end = start + feedback_minibatch_size obs = np.asarray([feedback_buffer[idx][0] for idx in inds[start:end]]) feedbacks = np.asarray([feedback_buffer[idx][1] for idx in inds[start:end]]) actions = np.asarray([a] * feedback_minibatch_size) if "bmm" in hf_loss_type: prop1, prop2 = hf_loss_param use_bootstrap = update * nsteps > only_use_hr_until * prop1 tmp = 1 - (1 - 0.001) * (update * nsteps - use_bootstrap) / (only_use_hr_until * prop2 - use_bootstrap) tmp = min(tmp, 0.001) pred, loss, _ = \ model.feedback_train_bootstrap(feedback_lr, obs, actions, feedbacks, bmm_model, use_bootstrap, tmp) else: pred, loss, _ = model.feedback_train(feedback_lr, obs, actions, feedbacks) # print('action: {} feedback: {} pred: {} loss: {}'.format(actions, feedbacks, pred, loss)) evaluate_start = time.time() obs_train = np.array([ele[0] for ele in feedback_buffer]) feedbacks_train = np.array([ele[1] for ele in feedback_buffer]) actions_train = np.array([a] * len(feedback_buffer)) obs_valid = np.array([ele[0] for ele in feedback_buffer_v]) feedbacks_valid = np.array([ele[1] for ele in feedback_buffer_v]) actions_valid = np.array([a] * len(feedback_buffer_v)) obs_train_true = np.array([ele[0] for ele in feedback_buffer_t]) feedbacks_train_true = np.array([ele[1] for ele in feedback_buffer_t]) actions_train_true = np.array([a] * len(feedback_buffer_t)) train_acc, train_loss = model.feedback_evaluate(obs_train, actions_train, feedbacks_train) valid_acc, _ = model.feedback_evaluate(obs_valid, actions_valid, feedbacks_valid) train_true_acc, _ = model.feedback_evaluate(obs_train_true, feedbacks_train_true, actions_train_true) feedback_bmms[a] = train_bmm_model(train_loss, a, update, base_path, feedbacks_train == feedbacks_train_true, good_feedback_acc) all_train_acc = np.concatenate([all_train_acc, train_acc]) all_valid_acc = np.concatenate([all_valid_acc, valid_acc]) all_train_true_acc = np.concatenate([all_train_true_acc, train_true_acc]) # print("evaluation takes ", time.time() - evaluate_start) all_train_acc, all_train_true_acc, all_valid_acc = \ np.mean(all_train_acc), np.mean(all_train_true_acc), np.mean(all_valid_acc) print("train acc {:>4.2f}; train true acc {:>4.2f}; valid acc {:>4.2f}".format( all_train_acc, all_train_true_acc, all_valid_acc)) performance["train_acc"].append(all_train_acc if math.isfinite(all_train_acc) else 0.) performance["train_true_acc"].append(all_train_true_acc if math.isfinite(all_train_true_acc) else 0.) performance["valid_acc"].append(all_valid_acc if math.isfinite(all_valid_acc) else 0.) 
lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) # logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) # logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) # logger.logkv('time_elapsed', tnow - tfirststart) # for (lossval, lossname) in zip(lossvals, model.loss_names): # logger.logkv(lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): model_dir = osp.join(base_path, "models") os.makedirs(model_dir, exist_ok=True) savepath = osp.join(model_dir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) print("Saved model successfully.") if use_feedback: performance_fname = os.path.join(base_path, "performance.p") with open(performance_fname, "wb") as f: pickle.dump(performance, f) env.close() return performance
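# --- Illustrative sketch --------------------------------------------------------------------
# A hypothetical stand-in for the get_simulated_feedback() helper used by learn() above; its
# real implementation (and signature, which also threads action_idxs through) is not shown in
# this file. The idea it illustrates: a judge labels each state-action pair as good or bad,
# and the simulated human feedback agrees with the judge only with probability
# good_feedback_acc (for good actions) or bad_feedback_acc (for bad actions).
import numpy as np

def simulated_feedback(states, actions, judge_action,
                       good_feedback_acc=0.7, bad_feedback_acc=0.7):
    feedbacks, correct_feedbacks = [], []
    for s, a in zip(states, actions):
        correct = 1 if judge_action(s, a) else 0          # ground-truth label from the judge
        acc = good_feedback_acc if correct else bad_feedback_acc
        noisy = correct if np.random.rand() < acc else 1 - correct
        feedbacks.append(noisy)
        correct_feedbacks.append(correct)
    return np.array(feedbacks), np.array(correct_feedbacks)

if __name__ == "__main__":
    states = np.random.randn(5, 3)
    actions = np.random.randint(0, 4, size=5)
    judge = lambda s, a: a % 2 == 0                       # toy judge: even actions are "good"
    print(simulated_feedback(states, actions, judge))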
def rollout(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, num_steps, num_envs, env_name, num_levels, start_level, distribution_mode, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) # if isinstance(lr, float): lr = constfn(lr) # else: assert callable(lr) # if isinstance(cliprange, float): cliprange = constfn(cliprange) # else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) # Instantiate the runner object # runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) # if eval_env is not None: # eval_runner = Runner(env = eval_env, model = model, nsteps = nsteps, gamma = gamma, lam= lam) # epinfobuf = deque(maxlen=100) # if eval_env is not None: # eval_epinfobuf = deque(maxlen=100) # if init_fn is not None: # init_fn() # # Start total timer # tfirststart = time.perf_counter() # nupdates = total_timesteps//nbatch # for update in range(1, nupdates+1): # assert nbatch % nminibatches == 0 # # Start timer # tstart = time.perf_counter() # frac = 1.0 - (update - 1.0) / nupdates # # Calculate the learning rate # lrnow = lr(frac) # # Calculate the cliprange # cliprangenow = cliprange(frac) # if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # # Get minibatch # obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 # if eval_env is not None: # eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run() #pylint: disable=E0632 # if update % log_interval == 0 and is_mpi_root: logger.info('Done.') # epinfobuf.extend(epinfos) # if eval_env is not None: # eval_epinfobuf.extend(eval_epinfos) # # Here what we're going to do is for each minibatch calculate the loss and append it. 
# mblossvals = [] # if states is None: # nonrecurrent version # # Index of each element of batch_size # # Create the indices array # inds = np.arange(nbatch) # for _ in range(noptepochs): # # Randomize the indexes # np.random.shuffle(inds) # # 0 to batch_size with batch_train_size step # for start in range(0, nbatch, nbatch_train): # end = start + nbatch_train # mbinds = inds[start:end] # slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) # mblossvals.append(model.train(lrnow, cliprangenow, *slices)) # else: # recurrent version # assert nenvs % nminibatches == 0 # envsperbatch = nenvs // nminibatches # envinds = np.arange(nenvs) # flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) # for _ in range(noptepochs): # np.random.shuffle(envinds) # for start in range(0, nenvs, envsperbatch): # end = start + envsperbatch # mbenvinds = envinds[start:end] # mbflatinds = flatinds[mbenvinds].ravel() # slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) # mbstates = states[mbenvinds] # mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) # # Feedforward --> get losses --> update # lossvals = np.mean(mblossvals, axis=0) # # End timer # tnow = time.perf_counter() # # Calculate the fps (frame per second) # fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) rewards = [] for i in range(num_steps): env = ProcgenEnv(num_envs=num_envs, env_name=env_name, num_levels=num_levels, start_level=start_level, distribution_mode=distribution_mode) env = VecExtractDictObs(env, "rgb") env = VecMonitor( venv=env, filename=None, keep_buf=100, ) env = VecNormalize(venv=env, ob=False) obs = env.reset() done = False reward = 0.0 timesteps = 0 while not done: # action = env.action_space.sample() # print("example of an action: ", action) # print("\n\n") # print("my action: ") actions, _, _, _ = model.step(obs) # print(actions.shape) # print("obs shape: ", obs.shape) # print(actions[0]) obs, r, done, _ = env.step(actions[0]) done = done.all() reward += r timesteps += 1 rewards.append(reward) #Logging reward, timesteps, and numsteps logger.logkv("numsteps", i) logger.logkv("timesteps", timesteps) logger.logkv("episode_reward_mean", safemean(reward)) logger.dumpkvs() # if update % log_interval == 0 or update == 1: # # Calculates if value function is a good predicator of the returns (ev > 1) # # or if it's just worse than predicting nothing (ev =< 0) # ev = explained_variance(values, returns) # logger.logkv("misc/serial_timesteps", update*nsteps) # logger.logkv("misc/nupdates", update) # logger.logkv("misc/total_timesteps", update*nbatch) # logger.logkv("fps", fps) # logger.logkv("misc/explained_variance", float(ev)) # logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) # logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) # if eval_env is not None: # logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) ) # logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) ) # logger.logkv('misc/time_elapsed', tnow - tfirststart) # for (lossval, lossname) in zip(lossvals, model.loss_names): # logger.logkv('loss/' + lossname, lossval) # logger.dumpkvs() # if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: # checkdir = osp.join(logger.get_dir(), 'checkpoints') # os.makedirs(checkdir, exist_ok=True) # savepath = osp.join(checkdir, '%.5i'%update) # print('Saving to', 
#                 savepath)
#             model.save(savepath)

    return model
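# --- Illustrative sketch (not part of the original source). Every learn() variant in this file
# accepts `lr` and `cliprange` either as constants or as schedules over the remaining-progress
# fraction `frac`. The helper below mirrors what baselines' constfn is assumed to do, and
# anneal_example shows how `frac` is derived inside the update loops above.
def constfn(val):
    """Wrap a constant so it can be called like a schedule f(frac) -> value."""
    def f(_frac):
        return val
    return f


def anneal_example(update, nupdates, lr=3e-4, cliprange=0.2):
    """Reproduce the per-update annealing pattern: frac runs from 1.0 down towards 0.0."""
    lr_fn = lr if callable(lr) else constfn(lr)
    clip_fn = cliprange if callable(cliprange) else constfn(cliprange)
    frac = 1.0 - (update - 1.0) / nupdates
    return lr_fn(frac), clip_fn(frac)

# e.g. anneal_example(1, 100) -> (0.0003, 0.2); with callable schedules both values shrink as frac -> 0.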
def learn(policy, env, ranking_buffer, args, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None): seed = args.seed batch_size = args.batch_size nsteps = args.nsteps total_timesteps = int(args.num_timesteps * 1.1) lr = args.lr set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: model.load(load_path) runner = Runner(env=env, model=model, ranking_buffer=ranking_buffer, nsteps=nsteps, gamma=gamma, lam=lam, batch_size=batch_size) epinfobuf = deque(maxlen=100) tfirststart = time.time() sl_next = 1 nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) # Judge whether/how frequent we do SL if args.disable_rapid: do_sl = False do_buffer = False else: if update * nbatch < args.sl_until and update >= sl_next: do_sl = True do_buffer = True next_gap = int(1 / (1.0 - update * nbatch / args.sl_until)) sl_next += next_gap elif update * nbatch < args.sl_until: do_sl = False do_buffer = True else: do_sl = False do_buffer = False obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( do_buffer, do_sl, args.sl_num, lrnow) epinfobuf.extend(epinfos) mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append( model.train(args.train_rl, lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', 
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            logger.logkv('episodes', runner.episodes_count)
            logger.record_tabular("rapid_loss", float(runner.rapid_loss))
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    env.close()
    return model
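# --- Illustrative sketch (not part of the original source). The ranking-buffer variant above gates
# its supervised-learning (SL) passes with `sl_next`: the gap until the next SL update is
# int(1 / (1 - consumed_fraction)) and therefore widens as `update * nbatch` approaches
# `args.sl_until`. The same scheduling logic rewritten standalone (argument values are hypothetical):
def sl_schedule(nbatch, sl_until, nupdates):
    """Return the update indices at which do_sl would be True under the rule above."""
    sl_next = 1
    sl_updates = []
    for update in range(1, nupdates + 1):
        if update * nbatch < sl_until and update >= sl_next:
            sl_updates.append(update)
            next_gap = int(1 / (1.0 - update * nbatch / sl_until))
            sl_next += next_gap
    return sl_updates

# e.g. sl_schedule(nbatch=2048, sl_until=1_000_000, nupdates=600): SL fires every update early on,
# then every 2nd, 3rd, ... update, and stops entirely once update * nbatch >= sl_until.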
def learn(*, agent_str, use_netrand, network, sess, env, nsteps, total_timesteps, ent_coef, lr, arch='impala', use_batch_norm=True, dropout=0, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, save_path=None, load_path=None, **network_kwargs): aug_func = AUG_FUNCS[agent_str] if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches if use_netrand: policy = RandomCnnPolicy Model = RandomModel else: policy = CnnPolicy Model = BaseModel model = Model(policy=policy, sess=sess, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, arch=arch, use_batch_norm=use_batch_norm, dropout=dropout) if load_path is not None: model.load(load_path) logger.info("Model pramas loaded from save") runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, aug_func=aug_func) epinfobuf10 = deque(maxlen=10) epinfobuf100 = deque(maxlen=100) tfirststart = time.time() nupdates = total_timesteps//nbatch logger.info("Running {} updates, each needs {} batches".format(nupdates, nbatch)) mean_rewards = [] datapoints = [] run_t_total = 0 train_t_total = 0 if use_netrand: init_rand = tf.variables_initializer([v for v in tf.global_variables() if 'randcnn' in v.name]) for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) run_tstart = time.time() if use_netrand: sess.run(init_rand) clean_flag = np.random.rand(1)[0] < use_netrand else: clean_flag = 0 obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(clean_flag) epinfobuf10.extend(epinfos) epinfobuf100.extend(epinfos) run_elapsed = time.time() - run_tstart run_t_total += run_elapsed mblossvals = [] logger.info('update: {} updating parameters...'.format(update)) train_tstart = time.time() if states is None: inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) if clean_flag: mblossvals.append(model.clean_train(lrnow, cliprangenow, *slices)) else: mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: assert nenvs % nminibatches == 0 envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) # update the dropout mask sess.run([model.train_model.dropout_assign_ops]) train_elapsed = time.time() - train_tstart train_t_total += train_elapsed lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % 
log_interval == 0 or update == 1:
            step = update * nbatch
            rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])
            rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])
            ep_len_mean_10 = np.nanmean([epinfo['l'] for epinfo in epinfobuf10])
            ep_len_mean_100 = np.nanmean([epinfo['l'] for epinfo in epinfobuf100])
            mean_rewards.append(rew_mean_10)
            datapoints.append([step, rew_mean_10])
            logger.logkv('eprew10', rew_mean_10)
            logger.logkv('eprew100', rew_mean_100)
            logger.logkv('eplenmean10', ep_len_mean_10)
            logger.logkv('eplenmean100', ep_len_mean_100)
            logger.logkv('nupdate', update)
            logger.logkv('misc/total_time_elapsed', tnow - tfirststart)
            logger.logkv('misc/run_t_total', run_t_total)
            logger.logkv('misc/train_t_total', train_t_total)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("fps", fps)
            if len(mblossvals):
                for (lossval, lossname) in zip(lossvals, model.loss_names):
                    logger.logkv('loss/' + lossname, lossval)
            logger.dumpkvs()
    if save_path:
        model.save(save_path)
    env.close()
    return model
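# --- Illustrative sketch (not part of the original source). The logging above averages episode
# rewards over two rolling buffers of different lengths. `safemean` is assumed to match the
# baselines helper: it returns NaN instead of raising on an empty buffer, so the first few
# log dumps do not crash before any episode has finished.
from collections import deque

import numpy as np


def safemean(xs):
    """Mean that degrades to NaN on an empty buffer."""
    return np.nan if len(xs) == 0 else np.mean(xs)


epinfobuf10, epinfobuf100 = deque(maxlen=10), deque(maxlen=100)
for episode_reward in [1.0, 0.0, 2.0, 5.0]:  # hypothetical episode returns
    epinfobuf10.append({'r': episode_reward})
    epinfobuf100.append({'r': episode_reward})
rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10])    # 2.0
rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100])  # 2.0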
def learn(network, FLAGS, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=10, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, episode_window_size=20, stop=True, scenario='gfootball.scenarios.1_vs_1_easy', curriculum=np.linspace(0, 0.9, 10), a=0, b=0, num_timesteps=200000, eval_period=20, eval_episodes=1, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) basic_builder = importlib.import_module(scenario, package=None) def build_builder_with_difficulty(difficulty): def builder_with_difficulty(builder): basic_builder.build_scenario(builder) builder.config().right_team_difficulty = difficulty builder.config().left_team_difficulty = difficulty return builder_with_difficulty def create_single_football_env(iprocess): """Creates gfootball environment.""" env = football_env.create_environment( env_name=build_builder_with_difficulty(0), stacked=('stacked' in FLAGS.state), rewards=FLAGS.reward_experiment, logdir=logger.get_dir(), write_goal_dumps=FLAGS.dump_scores and (iprocess == 0), write_full_episode_dumps=FLAGS.dump_full_episodes and (iprocess == 0), render=FLAGS.render and (iprocess == 0), dump_frequency=50 if FLAGS.render and iprocess == 0 else 0) env = monitor.Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(iprocess))) return env env = SubprocVecEnv([(lambda _i=i: create_single_football_env(_i)) for i in range(FLAGS.num_envs)], context=None) policy = build_policy(env, network, **network_kwargs) average_window_size = episode_window_size * 16 # Get the nb of env nenvs = FLAGS.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Configure logger to log_ppo_timestamp formatted pickle_str = 'curriculum_a-%db-%d_ppo_impala_chkpt' % (a, b) + '-'.join( str(datetime.datetime.now()).replace(':', ' ').split(' ')) eval_pickle_str = pickle_str + '_eval' # open pickle file to append relevant data in binary pickle_dir = '/content/cs285_f2020_proj/football/pickled_data/' model_dir = '/content/cs285_f2020_proj/football/models/' # create dir for pickling & model save if not os.path.exists(pickle_dir): os.makedirs(pickle_dir) if not os.path.exists(model_dir): os.makedirs(model_dir) def make_file(file_path): if not os.path.exists(file_path): with open(file_path, 'w+'): print('made path', file_path) make_file(pickle_dir + pickle_str) make_file(pickle_dir + eval_pickle_str) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: from baselines.ppo2.model import Model model_fn = Model model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, comm=comm, mpi_rank_weight=mpi_rank_weight) if load_path is not None: model.load(load_path) def create_single_football_env(iprocess, difficulty): """Creates gfootball environment.""" env = football_env.create_environment( env_name=build_builder_with_difficulty(difficulty), stacked=('stacked' in FLAGS.state), rewards=FLAGS.reward_experiment, logdir=logger.get_dir(), write_goal_dumps=FLAGS.dump_scores and (iprocess == 0), write_full_episode_dumps=FLAGS.dump_full_episodes and (iprocess == 0), render=FLAGS.render and (iprocess == 0), dump_frequency=50 if FLAGS.render and iprocess == 0 else 0) env = monitor.Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(iprocess))) return env def make_runner(difficulty): vec_env = SubprocVecEnv( [(lambda _i=i: create_single_football_env(_i, difficulty)) for i in range(FLAGS.num_envs)], 
context=None) print('vec env obs space', vec_env.observation_space) return env, Runner(env=vec_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) # get next difficulty according to distribution outlined in probabilities. def get_next_difficulty(): draw = np.random.choice(range(10), 1, p=curriculum_probabilities) return draw[0] # Instantiate the runner object # Curriculum difficulties start off as random. curriculum_probabilities = [0.1] * 10 difficulty_idx = get_next_difficulty() env, runner = make_runner(curriculum[difficulty_idx]) def make_eval_runner(difficulty): vec_env = SubprocVecEnv( [(lambda _i=i: create_single_football_env(_i, difficulty)) for i in range(FLAGS.num_envs, 2 * FLAGS.num_envs)], context=None) print('vec env obs space', vec_env.observation_space) return env, Runner(env=vec_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) policy = build_policy(env, network, **network_kwargs) eprews = [] rews_by_difficulty = [[] for i in range(10)] # for logging TEMP ep_vars = [0] * 10 lmeans = [] lvariances = [] rdi = 20 # reward difference interval, in episodes smart_mean = lambda l: np.mean(l) if l else 0 smart_var = lambda l: np.var(l) if l else 0 def update_curriculum_probabilities(): rdi_rew_means = np.array( [smart_mean(diffrew[-rdi:]) for diffrew in rews_by_difficulty]) rdi_rew_vars = np.array( [smart_var(diffrew[-rdi:]) for diffrew in rews_by_difficulty]) print('means', rdi_rew_means) print('variances', rdi_rew_vars) lmeans.extend(rdi_rew_means) lvariances.extend(rdi_rew_vars) e_diff_rews = np.exp(a * rdi_rew_means + b * rdi_rew_vars) return rdi_rew_vars, e_diff_rews / np.sum(e_diff_rews) # eval_rews[i] will be all the rewards from evaluation i # eval_rews[i][j] will be rewards from evaluation i at difficulty j ~ 2:20 = (0.05, 0.95) eval_rews = [] epinfobuf = deque(maxlen=100) if init_fn is not None: init_fn() # Start total timer tfirststart = time.perf_counter() # nupdates = total_timesteps//nbatch update = 0 while update * nsteps < num_timesteps: update += 1 assert nbatch % nminibatches == 0 # Start timer tstart = time.perf_counter() # frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(0) # Constant LR, cliprange # Calculate the cliprange cliprangenow = cliprange(0) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 if update % log_interval == 0 and is_mpi_root: logger.info('Done.') rewards_this_episode = [i['r'] for i in epinfos] lengths_this_episode = [i['l'] for i in epinfos] print('episode rewards ep#', update, rewards_this_episode) eprews.extend(rewards_this_episode) epinfobuf.extend(epinfos) # for each minibatch calculate the loss and append it. 
mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # sum of last average_window_size rewards last_aws_rewards_sum = sum(eprews[-average_window_size:]) print('LAST %d ep mean reward' % episode_window_size, last_aws_rewards_sum / (average_window_size + 0.0)) rews_by_difficulty[difficulty_idx].append(np.sum(rewards_this_episode)) # for logging TEMP print('mean of means', np.mean(lmeans), 'var of means', np.var(lmeans)) print('mean of vars', np.mean(lvariances), 'var of vars', np.var(lvariances)) # pickling pickle_data = { 'episode': update, 'timesteps': update * nsteps, 'episode_rewards': rewards_this_episode, 'episode_window_size': episode_window_size, # 'last_window_size_rewards' : eprews[-average_window_size:], 'difficulty': curriculum[difficulty_idx], 'len_rewards_array': len(eprews), 'episode_lenths': lengths_this_episode, 'eval_period': eval_period, 'probabilities': curriculum_probabilities, 'running_ws_variance': ep_vars, 'a': a, 'b': b, } def dict_print(d): for k in d: print(k, d[k]) dict_print(pickle_data) with open(pickle_dir + pickle_str, 'ab') as pickle_file: pickle.dump(pickle_data, pickle_file) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) # every eval period run for eval_nsteps on every difficulty if update % eval_period == 1: # rews[i] = sum of rewards from eval_nsteps for difficulty index i eval_rews_period = [] # 2D array eval_rews_period_sum = [] # 1D array for difficulty_eval in curriculum: eval_env, eval_runner = make_eval_runner(difficulty_eval) eval_rewards_for_difficulty = [] for k in range(eval_episodes): # run nsteps for the number of eval episodes (nsteps * episodes) eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 # append the array of all the rewards gotten for this difficulty in the episode. 
eval_rewards_for_difficulty.extend( [i['r'] for i in eval_epinfos]) eval_rews_period.append(eval_rewards_for_difficulty) eval_rews_period_sum.append(sum(eval_rewards_for_difficulty)) print("rews eval timstep", update * nsteps, "difficulty", difficulty_eval, eval_rewards_for_difficulty, "sum", eval_rews_period_sum[-1]) eval_rews.append(eval_rews_period) eval_pickle_data = [ update * nsteps, # timesteps for trainer eval_rews_period, # 2D array which contains all rewards gotten for all difficulties this eval period. eval_rews_period_sum ] with open(pickle_dir + eval_pickle_str, 'ab') as eval_pickle_file: pickle.dump(eval_pickle_data, eval_pickle_file) print('eval pickle dumped, u#', update) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('loss/' + lossname, lossval) logger.dumpkvs() if save_interval and update % save_interval == 1: savepath = osp.join(model_dir, pickle_str) print('Saving to', savepath) model.save(savepath) ep_vars, curriculum_probabilities = update_curriculum_probabilities() print('new probability distr:', curriculum_probabilities) difficulty_idx = get_next_difficulty() print("NEXT DIFFICULTY:", curriculum[difficulty_idx]) env, runner = make_runner(curriculum[difficulty_idx]) return model
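# --- Illustrative sketch (not part of the original source). The curriculum variant above re-weights
# its ten difficulty levels with a softmax over exp(a * mean_reward + b * reward_variance), both
# statistics taken over the last `rdi` episodes recorded per difficulty. Standalone rewrite:
import numpy as np


def curriculum_probabilities_update(rews_by_difficulty, a, b, rdi=20):
    """Return (per-difficulty variances, sampling probabilities), as in update_curriculum_probabilities."""
    means = np.array([np.mean(r[-rdi:]) if r else 0.0 for r in rews_by_difficulty])
    variances = np.array([np.var(r[-rdi:]) if r else 0.0 for r in rews_by_difficulty])
    weights = np.exp(a * means + b * variances)
    return variances, weights / weights.sum()

# With a > 0 the sampler favours difficulties where the agent already scores well; with b > 0 it
# favours difficulties whose recent rewards are still highly variable (i.e. still being learned).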
def learn(*, network, sess, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, save_path=None, load_path=None, **network_kwargs): comm = MPI.COMM_WORLD rank = comm.Get_rank() mpi_size = comm.Get_size() #sess = tf.get_default_session() # tb_writer = TB_Writer(sess) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches policy = CrossCnnPolicy model = Model(policy=policy, sess=sess, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) # utils.load_all_params(sess) if load_path is not None: model.load(load_path) logger.info("Model pramas loaded from save") runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) logger.info("Initilizing runner") epinfobuf10 = deque(maxlen=10) epinfobuf100 = deque(maxlen=100) tfirststart = time.time() active_ep_buf = epinfobuf100 nupdates = total_timesteps // nbatch logger.info("Running {} updates, each needs {} batches".format( nupdates, nbatch)) mean_rewards = [] datapoints = [] run_t_total = 0 train_t_total = 0 can_save = True checkpoints = list(range(0, 2049, 10)) saved_key_checkpoints = [False] * len(checkpoints) #init_rand = tf.variables_initializer([v for v in tf.global_variables() if 'randcnn' in v.name]) # if Config.SYNC_FROM_ROOT and rank != 0: # can_save = False # def save_model(base_name=None): # base_dict = {'datapoints': datapoints} # utils.save_params_in_scopes( # sess, ['model'], Config.get_save_file(base_name=base_name), base_dict) for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) #logger.info('collecting rollouts...') run_tstart = time.time() obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) epinfobuf10.extend(epinfos) epinfobuf100.extend(epinfos) run_elapsed = time.time() - run_tstart run_t_total += run_elapsed #logger.info('rollouts complete') mblossvals = [] logger.info('update: {} updating parameters...'.format(update)) train_tstart = time.time() if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # update the dropout mask 
sess.run([model.train_model.dropout_assign_ops]) train_elapsed = time.time() - train_tstart train_t_total += train_elapsed #ogger.info('update complete') lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: step = update * nbatch #rew_mean_10 = utils.process_ep_buf(active_ep_buf, tb_writer=tb_writer, suffix='', step=step) rew_mean_10 = safemean([epinfo['r'] for epinfo in epinfobuf10]) rew_mean_100 = safemean([epinfo['r'] for epinfo in epinfobuf100]) ep_len_mean_10 = np.nanmean( [epinfo['l'] for epinfo in epinfobuf10]) ep_len_mean_100 = np.nanmean( [epinfo['l'] for epinfo in epinfobuf100]) logger.info('\n----', update) mean_rewards.append(rew_mean_10) datapoints.append([step, rew_mean_10]) mean_rewards.append(rew_mean_10) logger.logkv('eprew10', rew_mean_10) logger.logkv('eprew100', rew_mean_100) logger.logkv('eplenmean10', ep_len_mean_10) logger.logkv('eplenmean100', ep_len_mean_100) logger.logkv('nupdate', update) #logger.info('time_elapsed', tnow - tfirststart, run_t_total, train_t_total) logger.logkv('misc/total_time_elapsed', tnow - tfirststart) logger.logkv('misc/run_t_total', run_t_total) logger.logkv('misc/train_t_total', train_t_total) #logger.info('timesteps', update*nsteps, total_timesteps) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("misc/serial_timesteps", update * nsteps) #logger.info('fps', fps) logger.logkv("fps", fps) if len(mblossvals): for (lossval, lossname) in zip(lossvals, model.loss_names): logger.info(lossname, lossval) #tb_writer.log_scalar(lossval, lossname) logger.logkv('loss/' + lossname, lossval) logger.info('----\n') logger.dumpkvs() #if can_save: if 0: ## not doing checkpoint saving yet if save_interval and (update % save_interval == 0): save_model() for j, checkpoint in enumerate(checkpoints): if (not saved_key_checkpoints[j]) and (step >= (checkpoint * 1e6)): saved_key_checkpoints[j] = True save_model(str(checkpoint) + 'M') # save_model() if save_path: model.save(save_path) env.close() return model
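# --- Illustrative sketch (not part of the original source). The recurrent branches above slice
# minibatches by environment rather than by individual transition, so LSTM states stay aligned with
# whole trajectories. `flatinds` maps the (env, step) grid onto flat batch indices (sizes hypothetical):
import numpy as np

nenvs, nsteps, nminibatches = 8, 16, 4
nbatch_train = (nenvs * nsteps) // nminibatches
envsperbatch = nbatch_train // nsteps              # = nenvs // nminibatches
flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)

envinds = np.arange(nenvs)
np.random.shuffle(envinds)
for start in range(0, nenvs, envsperbatch):
    mbenvinds = envinds[start:start + envsperbatch]
    mbflatinds = flatinds[mbenvinds].ravel()       # contiguous nsteps-long rows for the chosen envs
    # arr[mbflatinds] then selects whole trajectories, and states[mbenvinds] their initial LSTM states.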
def update(self):
    if self.update_index > self.nupdates:
        return False
    assert self.nbatch % self.nminibatches == 0
    self.nbatch_train = self.nbatch // self.nminibatches
    tstart = time.time()
    frac = 1.0 - (self.update_index - 1.0) / self.nupdates
    lrnow = self.lr(frac)
    cliprangenow = self.cliprange(frac)
    obs, returns, masks, actions, values, neglogpacs, states, epinfos = self.runner.run()  #pylint: disable=E0632
    self.epinfobuf.extend(epinfos)
    mblossvals = []
    if states is None:  # nonrecurrent version
        inds = np.arange(self.nbatch)
        for _ in range(self.noptepochs):
            np.random.shuffle(inds)
            for start in range(0, self.nbatch, self.nbatch_train):
                end = start + self.nbatch_train
                mbinds = inds[start:end]
                slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                mblossvals.append(self.model.train(lrnow, cliprangenow, *slices))
    else:  # recurrent version
        assert self.nenvs % self.nminibatches == 0
        envsperbatch = self.nenvs // self.nminibatches
        envinds = np.arange(self.nenvs)
        flatinds = np.arange(self.nenvs * self.nsteps).reshape(self.nenvs, self.nsteps)
        envsperbatch = self.nbatch_train // self.nsteps
        for _ in range(self.noptepochs):
            np.random.shuffle(envinds)
            for start in range(0, self.nenvs, envsperbatch):
                end = start + envsperbatch
                mbenvinds = envinds[start:end]
                mbflatinds = flatinds[mbenvinds].ravel()
                slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                mbstates = states[mbenvinds]
                mblossvals.append(self.model.train(lrnow, cliprangenow, *slices, mbstates))
    lossvals = np.mean(mblossvals, axis=0)
    tnow = time.time()
    fps = int(self.nbatch / (tnow - tstart))
    if self.update_index % self.log_interval == 0 or self.update_index == 1:
        ev = explained_variance(values, returns)
        logger.logkv("serial_timesteps", self.update_index * self.nsteps)
        logger.logkv("nupdates", self.update_index)
        logger.logkv("total_timesteps", self.update_index * self.nbatch)
        logger.logkv("fps", fps)
        logger.logkv("explained_variance", float(ev))
        logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in self.epinfobuf]))
        logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in self.epinfobuf]))
        logger.logkv('time_elapsed', tnow - self.tfirststart)
        logger.logkv('agent', self.scope)
        for (lossval, lossname) in zip(lossvals, self.model.loss_names):
            logger.logkv(lossname, lossval)
        logger.dumpkvs()
    if self.save_interval and (self.update_index % self.save_interval == 0 or self.update_index == 1) and logger.get_dir():
        checkdir = osp.join(logger.get_dir(), 'checkpoints')
        os.makedirs(checkdir, exist_ok=True)
        savepath = osp.join(checkdir, '%.5i' % self.update_index)
        print('Saving to', savepath)
        self.model.save(savepath)
    self.update_index += 1
    self.min_reward = safemin([epinfo['r'] for epinfo in self.epinfobuf])
    self.max_reward = safemax([epinfo['r'] for epinfo in self.epinfobuf])
    return True
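# --- Illustrative sketch (not part of the original source). Several variants, including update()
# above, log explained_variance(values, returns) to check how well the value head predicts the
# empirical returns. The helper is assumed to match the baselines definition:
import numpy as np


def explained_variance(ypred, y):
    """1 - Var[y - ypred] / Var[y]: 1 is perfect, 0 is no better than predicting the mean, < 0 is worse."""
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary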
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=16, noptepochs=4, cliprange=0.2, save_interval=0, nddpgbatches=32, ddpg_per_ppo=128, target_lag=1, ddpg_ac_weight=0.1, annealing_updates=50, with_ddpg=True, with_annealing=True): global use_ddpg global use_annealing use_ddpg = with_ddpg use_annealing = with_annealing if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, batch_size=nddpgbatches) if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) tfirststart = time.time() nupdates = total_timesteps // nbatch print('nupdates', nupdates) ddpg_w = ddpg_ac_weight if use_annealing else 0.0 for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 if ddpg_w > 0.0: ddpg_w -= 1 / float(annealing_updates) * ddpg_w values_list = [] nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) obs, returns, rewards, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 if use_annealing: ddpg_ac_list = [] for idx in range(obs.shape[0]): ddpg_ac, _ = model.agent.pi(obs[idx], apply_noise=False, compute_Q=False) ddpg_ac_list.append(ddpg_ac) ddpg_ac = np.asarray(ddpg_ac_list) values_list.append(values) # print('obs.shape', obs.shape, 'rewards.shape', returns.shape, 'masks.shape', masks.shape, 'actions.shape', actions.shape) epinfobuf.extend(epinfos) mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) if not use_annealing: mblossvals.append( model.train(lrnow, cliprangenow, *slices)) else: mblossvals.append( model.train(lrnow, cliprangenow, *slices, ddpg_acs=ddpg_ac[mbinds], ddpg_w=ddpg_w)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] # mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) if not use_annealing: mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) else: mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates, ddpg_acs=ddpg_ac[mbinds], 
ddpg_w=ddpg_w)) if use_ddpg: mbcritic_loss = [] mbactor_loss = [] # ------------- train DDPG ---------------- for _ in range(ddpg_per_ppo * noptepochs * nminibatches): cl, al = model.agent.train() mbcritic_loss.append(cl) mbactor_loss.append(al) if update > target_lag: model.agent.update_target_net() # print('noptepochs', noptepochs, 'nbatch_train', nbatch_train, 'nbatch', nbatch) # ------------- train DDPG ---------------- lossvals = np.mean(mblossvals, axis=0) values_avg = np.mean(values_list) if use_ddpg: critic_loss = np.mean(mbcritic_loss) actor_loss = np.mean(mbactor_loss) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) logger.logkv('value estimation', values_avg) logger.logkv('eprew_max', np.max(mblossvals)) logger.logkv('eprew_min', np.min(mblossvals)) logger.logkv('eprew_std', np.std(mblossvals)) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if use_ddpg: logger.logkv('critic_loss', critic_loss) logger.logkv('actor_loss', actor_loss) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) env.close()
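# --- Illustrative sketch (not part of the original source). The PPO+DDPG variant above anneals the
# DDPG action-matching weight geometrically: each update it subtracts 1/annealing_updates of the
# current value, i.e. multiplies by (1 - 1/annealing_updates). The schedule in isolation:
def ddpg_weight_schedule(ddpg_ac_weight, annealing_updates, nupdates):
    """Yield the ddpg_w used at each update under the decay rule above."""
    ddpg_w = ddpg_ac_weight
    for _ in range(nupdates):
        if ddpg_w > 0.0:
            ddpg_w -= 1.0 / float(annealing_updates) * ddpg_w
        yield ddpg_w

# e.g. list(ddpg_weight_schedule(0.1, 50, 3)) -> [0.098, 0.09604, 0.0941192]; after
# `annealing_updates` updates the weight has only decayed to roughly 1/e of its start value.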
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_model, load_model_path, save_model, save_model_path): if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) #nenvs = env.num_envs nenvs=1 ob_space = Box(low=0, high=1, shape=(84, 84, 4)) ac_space = Discrete(8) nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if load_model: print('loading model') make_model.load(load_model_path) if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model print('test0') runner = Runner(env=env, model=model, ob_space=ob_space, nsteps=nsteps, gamma=gamma, lam=lam) print('test1') epinfobuf = deque(maxlen=100) tfirststart = time.time() nupdates = total_timesteps//nbatch print('nbatch = ', nbatch) print('test2') for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) print('test4') obs, rewards, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 print('test5') print('size of returns is', returns.shape) np.reshape(obs, (nsteps, 84, 84, 4)) obs.shape = (nsteps, 84, 84, 4) print('obs.shape= ', obs.shape) rewards.shape = (nsteps,) returns.shape = (nsteps,) masks.shape = (nsteps,) actions.shape = (nsteps,) values.shape = (nsteps,) neglogpacs.shape = (nsteps,) epinfobuf.extend(epinfos) mblossvals = [] inds = np.arange(nbatch) print('len(inds)=nbatch=', len(inds)) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) # slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) print('obs.shape=', obs.shape) print('returns.shape=', returns.shape) print('masks.shape=', masks.shape) print('actions.shape=', actions.shape) print('values.shape=', values.shape) print('neglogpacs.shape=', neglogpacs.shape) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): checkdir = osp.join(logger.get_dir(), 'checkpoints') 
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            # print('Saving model to', save_model_path)
            # model.save(save_model_path)
            # print('Saving to', savepath)
            # model.save(savepath)
    print('test4')
    env.close()
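# --- Illustrative sketch (not part of the original source). Most variants in this file write
# checkpoints the same way: a 'checkpoints' directory under logger.get_dir() and the zero-padded
# update index as the filename. The path handling in isolation (logdir is hypothetical):
import os
import os.path as osp


def checkpoint_path(logdir, update):
    """Return the save path used above, creating the checkpoints directory if needed."""
    checkdir = osp.join(logdir, 'checkpoints')
    os.makedirs(checkdir, exist_ok=True)
    return osp.join(checkdir, '%.5i' % update)

# e.g. checkpoint_path('/tmp/run0', 7) -> '/tmp/run0/checkpoints/00007'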
def learn(*, network, env, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if save_interval and logger.get_dir( ) and False: # Added the false because saving make_model threw "TypeError: Pickling an AuthenticationString object is disallowed for security reasons" import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: print(make_model) fh.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: model.load(load_path) runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) tfirststart = time.time() nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): full_start_time = time.time() assert nbatch % nminibatches == 0 tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) start_env_step_time = time.time() obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 end_env_step_time = time.time() epinfobuf.extend(epinfos) mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) start_train_time = time.time() for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) end_train_time = time.time() else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) full_end_time = time.time() if update % log_interval == 0 or update == 1: print('Full time: ', full_end_time - full_start_time, "s") print('Env step time: ', end_env_step_time - start_env_step_time, "s") print('Train time: ', end_train_time - start_train_time, "s") if safemean([epinfo['r'] for epinfo in epinfobuf]) > 3: print([epinfo['r'] for epinfo in epinfobuf]) #break ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('time_elapsed', 
                         tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and MPI.COMM_WORLD.Get_rank() == 0:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    env.close()
    return model
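# --- Illustrative sketch (not part of the original source). The non-recurrent branches above
# shuffle a flat index array once per optimisation epoch and carve it into nbatch_train-sized
# minibatches. The same pattern with dummy arrays (shapes and sizes are hypothetical):
import numpy as np

nbatch, nbatch_train, noptepochs = 2048, 512, 4
obs = np.zeros((nbatch, 84, 84, 4), dtype=np.uint8)
returns = np.zeros(nbatch, dtype=np.float32)
actions = np.zeros(nbatch, dtype=np.int64)

inds = np.arange(nbatch)
for _ in range(noptepochs):
    np.random.shuffle(inds)
    for start in range(0, nbatch, nbatch_train):
        mbinds = inds[start:start + nbatch_train]
        slices = tuple(arr[mbinds] for arr in (obs, returns, actions))
        # model.train(lrnow, cliprangenow, *slices) would consume one minibatch here.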
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, restore_path, save_path, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0): if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) model = make_model() runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) tfirststart = time.time() nupdates = total_timesteps // nbatch saver = tf.train.Saver() if restore_path: saver.restore(model.sess, restore_path) print("Model restored from file:", restore_path) for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) # pylint: disable=E0632 epinfobuf.extend(epinfos) mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) # logger.logkv("serial_timesteps", update * nsteps) # logger.logkv("nupdates", update) # logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('reward', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('successes', safemean([epinfo['s'] for epinfo in epinfobuf])) logger.logkv('episode_steps', safemean([epinfo['l'] for epinfo in epinfobuf])) # logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and save_path: saver.save(model.sess, save_path) print("Model saved in file:", save_path) env.close()
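# --- Illustrative sketch (not part of the original source). Every Runner used above produces its
# `returns` via GAE(lambda) with the `gamma`/`lam` arguments, and the variant that follows spells
# the recursion out in add_vtarg_and_adv. A standalone version over one trajectory of T steps
# (indexing convention here: dones[t] == 1 means the episode ended after step t):
import numpy as np


def gae_advantages(rewards, values, dones, gamma=0.99, lam=0.95):
    """values has length T + 1 (bootstrap value last); returns (advantages, lambda-returns)."""
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(values, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)
    T = len(rewards)
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    return adv, adv + values[:-1]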
def learn(*, network, env, total_timesteps, dtarg=0.01, adaptive_kl=0, trunc_rho=1.0, clipcut=0.2, useadv=0, vtrace=0, rgae=0, eval_env=None, seed=None, ERlen=1, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=None, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space acdim = ac_space.shape[0] # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, adaptive_kl=adaptive_kl) model = make_model() if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = EvalRunner(env=eval_env, model=model, nsteps=10 * nsteps, gamma=gamma, lam=lam) eval_runner.obfilt = runner.obfilt eval_runner.rewfilt = runner.rewfilt epinfobuf = deque(maxlen=10) if eval_env is not None: eval_epinfobuf = deque(maxlen=10) # Start total timer tfirststart = time.time() nupdates = total_timesteps // nbatch def add_vtarg_and_adv(seg, gamma, value, lam): """ Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) """ done = np.append( seg["done"], 0 ) # last element is only used for last vtarg, but we already zeroed it if last new = 1 T = len(seg["rew"]) gaelam = np.empty(T, 'float32') rew = runner.rewfilt(seg["rew"]) lastgaelam = 0 for t in reversed(range(T)): nonterminal = 1 - done[t + 1] delta = rew[t] + gamma * value[t + 1] * nonterminal - value[t] gaelam[ t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam ret = gaelam + value[:-1] return gaelam, ret def add_vtarg_and_adv_vtrace(seg, gamma, value, rho, trunc_rho, acdim=None): """ Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) """ done = np.append( seg["done"], 0 ) # last element is only used for last vtarg, but we already zeroed it if last new = 1 rho_ = np.append(rho, 1.0) if acdim is not None: rho_ = np.exp(np.log(rho_) / acdim) r = np.minimum(trunc_rho, rho_) c = lam * np.minimum(1.0, rho_) T = len(seg["rew"]) gaelam = np.empty(T, 'float32') gaelam2 = np.empty(T, 'float32') rew = runner.rewfilt(seg["rew"]) lastgaelam = 0 for t in reversed(range(T)): nonterminal = 1 - done[t + 1] delta = (rew[t] + gamma * value[t + 1] * nonterminal - value[t]) gaelam[t] = delta + gamma * lam * nonterminal * lastgaelam lastgaelam = r[t] * gaelam[t] ret = r[:-1] * gaelam + value[:-1] adv = rew + gamma * (1.0 - done[1:]) * np.hstack([ret[1:], value[T] ]) - value[:-1] return adv, ret, gaelam def add_vtarg_and_adv_vtrace4(seg, gamma, value, rho, trunc_rho, acdim=None): """ Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) """ done = np.append( seg["done"], 0 ) # last element is only used for last vtarg, but we already zeroed it if last new = 1 rho_ = np.append(rho, 1.0) if acdim is not None: rho_ = np.exp(np.log(rho_) / acdim) T = len(seg["rew"]) gaelam = np.zeros(T, 'float32') rew = runner.rewfilt(seg["rew"]) delta = (rew + gamma * value[1:] * (1.0 - done[1:]) - value[:-1]) gamlam = np.zeros(T, 'float32') for i in range(T): gamlam[i] = (gamma * lam)**i idx = T c = np.ones(T) for t in reversed(range(T)): # 
print(delta2) for j in range(t, T): if done[j + 1]: idx = j + 1 break gaelam[t] = np.sum(gamlam[:idx - t] * (np.minimum(1.0, c) * delta)[t:idx]) c[t:] = rho_[t] * c[t:] ret = np.minimum(trunc_rho, rho_[:-1]) * gaelam + value[:-1] adv = rew + gamma * (1.0 - done[1:]) * np.hstack([ret[1:], value[T] ]) - value[:-1] return adv, ret, gaelam seg = None cliprangenow = cliprange(1.0) klconst = 1.0 for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange # Get minibatch if seg is None: prev_seg = seg seg = {} else: prev_seg = {} for i in seg: prev_seg[i] = np.copy(seg[i]) seg["ob"], seg["rew"], seg["done"], seg["ac"], seg["neglogp"], seg[ "mean"], seg[ "logstd"], final_obs, final_done, epinfos = runner.run() #pylint: disable=E0632 # print(np.shape(seg["ob"])) if prev_seg is not None: for key in seg: if len(np.shape(seg[key])) == 1: seg[key] = np.hstack([prev_seg[key], seg[key]]) else: seg[key] = np.vstack([prev_seg[key], seg[key]]) if np.shape(seg[key])[0] > ERlen * nsteps: seg[key] = seg[key][-ERlen * nsteps:] ob_stack = np.vstack([seg["ob"], final_obs]) values = model.values(runner.obfilt(ob_stack)) values[:-1] = (1.0 - final_done) * values[:-1] ob = runner.obfilt(seg["ob"]) mean_now, logstd_now = model.meanlogstds(ob) # print(np.shape(seg["ac"])[1]) neglogpnow = 0.5 * np.sum(np.square((seg["ac"] - mean_now) / np.exp(logstd_now)), axis=-1) \ + 0.5 * np.log(2.0 * np.pi) * np.shape(seg["ac"])[1] \ + np.sum(logstd_now, axis=-1) neglogpold = 0.5 * np.sum(np.square((seg["ac"] - seg["mean"]) / np.exp(logstd_now)), axis=-1) \ + 0.5 * np.log(2.0 * np.pi) * np.shape(seg["ac"])[1] \ + np.sum(logstd_now, axis=-1) rho = np.exp(-neglogpnow + neglogpold) # print(len(mean_now)) # print(cliprangenow) # print(rho) if vtrace == 1: adv, ret, gae = add_vtarg_and_adv_vtrace(seg, gamma, values, rho, trunc_rho) if useadv: gae = adv elif vtrace == 4: adv, ret, gae = add_vtarg_and_adv_vtrace4(seg, gamma, values, rho, trunc_rho) if useadv: gae = adv else: gae, ret = add_vtarg_and_adv(seg, gamma, values, lam) r = np.minimum(1.0, rho) r_gae = gae * r print("======") print(gae) print(r_gae) print(gae.mean()) print(r_gae.mean()) print(gae.std()) print(r_gae.std()) print(r.mean()) print("======") if eval_env is not None: eval_obs, eval_returns, eval_masks, eval_actions, _, _, eval_epinfos = eval_runner.run( ) #pylint: disable=E0632 prior_row = np.zeros(len(seg["ob"])) temp_ = [] for i in range(int(len(prior_row) / nsteps)): temp_row = np.mean( np.abs(rho[i * nsteps:(i + 1) * nsteps] - 1.0) + 1.0) # local_rho[i + (ERlen-int(len(prior_row)/nsteps))].append(temp_row) temp_.append(temp_row) print(temp_) rho_after = np.exp(- 0.5 * np.square((seg["ac"] - mean_now) / np.exp(logstd_now)) \ + 0.5 * np.square((seg["ac"] - seg["mean"]) / np.exp(logstd_now))) temp_prior = [] for i in range(int(len(prior_row) / nsteps)): temp_row = np.mean( np.abs(rho_after[i * nsteps:(i + 1) * nsteps] - 1.0) + 1.0) # local_rho[i + (ERlen-int(len(prior_row)/nsteps))].append(temp_row) if temp_row > 1 + clipcut: prior_row[i * nsteps:(i + 1) * nsteps] = 0 else: prior_row[i * nsteps:(i + 1) * nsteps] = 1 # prior_row[i * nsteps:(i + 1) * nsteps] = 1 temp_prior.append(temp_row) print(temp_prior) # for i in range(len(prior_row)): # if (np.abs(rho[i] - 1.0) + 1.0)>1.05: # prior_row[i]=0 # else: # prior_row[i]=1 # for i in range(len(prior_row)): # if rho[i]>1.1 : # prior_row[i]=0 # else: # prior_row[i]=1 # 
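    # The per-rollout statistics above measure how far the current policy has drifted from the
    # behaviour policy that generated each stored rollout: temp_prior holds the mean importance-ratio
    # deviation for each block of nsteps transitions, and prior_row keeps only the rollouts whose
    # deviation stays within clipcut, so stale experience is excluded from the minibatch sampling below.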
prob = prior_row/np.sum(prior_row) print(np.sum(prior_row)) epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. mblossvals = [] # Index of each element of batch_size # Create the indices array inds1 = np.arange(len(seg["ob"]) - nsteps) inds2 = np.arange(nsteps) + len(seg["ob"]) - nsteps print(len(seg["ob"])) print(cliprangenow) nbatch_adapt1 = int( (np.sum(prior_row) - nsteps) / nsteps * nbatch_train) nbatch_adapt2 = int((nsteps) / nsteps * nbatch_train) print(rho) idx1 = [] idx2 = [] kl_rest = np.ones(len(seg["ob"])) * np.sum(prior_row) / nsteps kl_rest[:-nsteps] = 0 # print(kl_rest) for _ in range(noptepochs): # Randomize the indexes # np.random.shuffle(inds) # 0 to batch_size with batch_train_size step # print(nbatch_adapt) losses_epoch = [] for _ in range(int(nsteps / nbatch_train)): if nbatch_adapt1 > 0: idx1 = np.random.choice(inds1, nbatch_adapt1, p=prior_row[:-2048] / np.sum(prior_row[:-2048])) idx2 = np.random.choice(inds2, nbatch_adapt2) # print(np.mean(np.abs(rho[mbinds] - 1.0) + 1.0)) idx = np.hstack([idx1, idx2]).astype(int) slices = (arr[idx] for arr in (ob, ret, gae, seg["done"], seg["ac"], values[:-1], neglogpold, seg["mean"], logstd_now, kl_rest, rho, neglogpnow)) loss_epoch = model.train(lrnow, cliprangenow, klconst, rgae, trunc_rho, *slices) mblossvals.append(loss_epoch) losses_epoch.append(loss_epoch) # # print(np.mean(losses_epoch, axis=0)) # mean_n, logstd_n = model.meanlogstds(runner.obfilt(seg["ob"])) # # print(np.shape(seg["ac"])[1]) # rho_after = np.exp(- 0.5 * np.square((seg["ac"] - mean_n) / np.exp(logstd_n)) \ # - logstd_n + 0.5 * np.square((seg["ac"] - seg["mean"]) / np.exp(seg["logstd"]))\ # + seg["logstd"]) # temp_ = [] # for i in range(int(len(prior_row) / nsteps)): # temp_row = np.mean(np.abs(rho_after[i * nsteps:(i + 1) * nsteps] - 1.0) + 1.0) # # local_rho[i + (ERlen-int(len(prior_row)/nsteps))].append(temp_row) # temp_.append(temp_row) # print(temp_) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) if adaptive_kl: print("KL avg :", lossvals[3]) if lossvals[3] > dtarg * 1.5: klconst *= 2 print("kl const is increased") elif lossvals[3] < dtarg / 1.5: klconst /= 2 print("kl const is reduced") klconst = np.clip(klconst, 2**(-10), 64) # End timer tnow = time.time() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values[:-1], ret) logger.logkv("batch IS weight", [int(1000 * s) / 1000. 
for s in np.array(temp_prior)]) logger.logkv("kl const", klconst) logger.logkv("clipping factor", cliprangenow) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) if eval_env is not None: logger.logkv( 'eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfos])) logger.logkv( 'eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfos])) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and ( MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) return model
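# A minimal standalone sketch of the adaptive KL-penalty rule applied in learn() above (the function
# name update_kl_coef is illustrative, not from the original code): the penalty coefficient is doubled
# when the measured KL exceeds 1.5x the target dtarg, halved when it falls below dtarg / 1.5, and kept
# inside [2**-10, 64], mirroring the klconst handling in the training loop.
import numpy as np

def update_kl_coef(klconst, approx_kl, dtarg=0.01):
    # Push the coefficient up when the policy moved too far, down when it barely moved.
    if approx_kl > dtarg * 1.5:
        klconst *= 2.0
    elif approx_kl < dtarg / 1.5:
        klconst /= 2.0
    # Keep the coefficient in the same range as the training loop above.
    return float(np.clip(klconst, 2.0 ** (-10), 64.0))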
def print_log(*, model, run_info, batching_config, lossvals, update, fps,
              epinfobuf, tnow, tfirststart):
    ev = explained_variance(run_info.values, run_info.returns)
    logger.logkv("serial_timesteps", update * batching_config.nsteps)
    logger.logkv("nupdates", update)
    logger.logkv("total_timesteps", update * batching_config.nbatch)
    logger.logkv("fps", fps)
    logger.logkv("explained_variance", float(ev))
    logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
    logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
    logger.logkv('time_elapsed', tnow - tfirststart)
    for (lossval, lossname) in zip(lossvals, model.loss_names):
        logger.logkv(lossname, lossval)
    logger.dumpkvs()
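# Usage sketch for print_log: the helper only reads run_info.values / run_info.returns and
# batching_config.nsteps / batching_config.nbatch, so lightweight containers are enough to drive it.
# RunInfo and BatchingConfig below are illustrative names, not defined in the original code.
from collections import namedtuple

RunInfo = namedtuple('RunInfo', ['values', 'returns'])
BatchingConfig = namedtuple('BatchingConfig', ['nsteps', 'nbatch'])

# Example call (all arguments come from the surrounding training loop):
# print_log(model=model,
#           run_info=RunInfo(values=values, returns=returns),
#           batching_config=BatchingConfig(nsteps=nsteps, nbatch=nsteps * nenvs),
#           lossvals=lossvals, update=update, fps=fps,
#           epinfobuf=epinfobuf, tnow=time.time(), tfirststart=tfirststart)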
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, num_casks=0): if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs - num_casks ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=env.num_envs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() if load_path is not None: model.load(load_path) # load running mean std checkdir = load_path[0:-5] checkpoint = int(load_path.split('/')[-1]) if osp.exists(osp.join(checkdir, '%.5i_ob_rms.pkl' % checkpoint)): with open(osp.join(checkdir, '%.5i_ob_rms.pkl' % checkpoint), 'rb') as ob_rms_fp: env.ob_rms = pickle.load(ob_rms_fp) # if osp.exists(osp.join(checkdir, '%.5i_ret_rms.pkl' % checkpoint)): # with open(osp.join(checkdir, '%.5i_ret_rms.pkl' % checkpoint), 'rb') as ret_rms_fp: # env.ret_rms = pickle.load(ret_rms_fp) # tensorboard writer = tf.summary.FileWriter(logger.get_dir(), tf.get_default_session().graph) runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, writer=writer, num_casks=num_casks) epinfobuf = deque(maxlen=100) tfirststart = time.time() nupdates = total_timesteps//nbatch for update in range(1, nupdates+1): assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 epinfobuf.extend(epinfos) mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates)) lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', 
safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('epsrewmean', safemean([epinfo['sr'] for epinfo in epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) logger.dumpkvs() # tensorboard summary = tf.Summary() summary.value.add(tag='iteration/reward_mean', simple_value=safemean([epinfo['r'] for epinfo in epinfobuf])) summary.value.add(tag='iteration/length_mean', simple_value=safemean([epinfo['l'] for epinfo in epinfobuf])) summary.value.add(tag='iteration/shaped_reward_mean', simple_value=safemean([epinfo['sr'] for epinfo in epinfobuf])) summary.value.add(tag='iteration/fps', simple_value=fps) writer.add_summary(summary, update) if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) model.save(savepath) # save running mean std with open(osp.join(checkdir, '%.5i_ob_rms.pkl' % update), 'wb') as ob_rms_fp: pickle.dump(env.ob_rms, ob_rms_fp) with open(osp.join(checkdir, '%.5i_ret_rms.pkl' % update), 'wb') as ret_rms_fp: pickle.dump(env.ret_rms, ret_rms_fp) env.close()
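# Minimal sketch of the running-statistics checkpointing done in learn() above, assuming env is a
# normalizing VecEnv wrapper exposing .ob_rms / .ret_rms (the helper names save_rms / load_ob_rms are
# illustrative): the statistics are pickled next to each model checkpoint so a reloaded policy sees
# observations normalized the same way they were during training.
import os.path as osp
import pickle

def save_rms(env, checkdir, update):
    # Persist observation and return running mean/std alongside the '%.5i' model checkpoint.
    with open(osp.join(checkdir, '%.5i_ob_rms.pkl' % update), 'wb') as fh:
        pickle.dump(env.ob_rms, fh)
    with open(osp.join(checkdir, '%.5i_ret_rms.pkl' % update), 'wb') as fh:
        pickle.dump(env.ret_rms, fh)

def load_ob_rms(env, checkdir, checkpoint):
    # Restore the observation statistics if they were saved for this checkpoint.
    path = osp.join(checkdir, '%.5i_ob_rms.pkl' % checkpoint)
    if osp.exists(path):
        with open(path, 'rb') as fh:
            env.ob_rms = pickle.load(fh)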
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=200, useentr, net_size, load_path=None, i_trial, method): if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = lambda : Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, vf_coef=vf_coef, max_grad_norm=max_grad_norm, net_size=net_size) if save_interval: import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() if load_path: model.load(load_path=load_path) runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=100) tfirststart = time.time() nupdates = total_timesteps//nbatch # ent_coef = max(ent_coef - 0.25*float(update) / float(nupdates), 0.001) ent_coef = useentr * ent_coef # ent_coef = entp - float(iters_so_far) / float(max_iters) for update in range(1, nupdates+1): # ent_coef = useentr * 0.01 # ent_coef = max(ent_coef - 0.25 * float(update) / float(nupdates), 0.001) assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates lrnow = lr(frac) cliprangenow = cliprange(frac) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run() #pylint: disable=E0632 epinfobuf.extend(epinfos) mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices, ent_dynamic=ent_coef)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates, ent_dynamic=ent_coef)) lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update*nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update*nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('EpRewMean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('EpLenMean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) logger.logkv('trial', i_trial) logger.logkv("Iteration", update) logger.logkv('Name', method) for (lossval, lossname) in zip(lossvals, 
model.loss_names): logger.logkv(lossname, lossval) logger.dumpkvs() # if update == 1 or update % 100==0 or update==nupdates: # rwd=runner.play(video_path=logger.get_dir()+'/videos', iters_so_far=update) # print('Average Retrun:{0}'.format(np.sum(rwd)/float(len(rwd)))) # print('Sum of Return:{0}'.format(np.sum(rwd))) if save_interval and (update % save_interval == 0 or update == 1 or update==nupdates) and logger.get_dir(): checkdir = get_dir(osp.join(logger.get_dir(), 'checkpoints')) savepath = osp.join(checkdir, '%.5i'%update) print('Saving to', savepath) model.save(savepath) # np.save('{}/mean'.format(checkdir + '/'), runner.env.obs.mean) # np.save('{}/var'.format(checkdir + '/'), runner.env.obs.var) env.close()
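# Sketch of the recurrent-policy minibatching used by the two learn() variants above (the generator
# name recurrent_minibatch_indices is illustrative): whole environments, not individual timesteps, are
# shuffled so that each minibatch keeps temporally contiguous sequences, and the same environment
# indices are used to slice the matching recurrent states.
import numpy as np

def recurrent_minibatch_indices(nenvs, nsteps, envsperbatch, rng=np.random):
    # Yield (environment indices, flattened step indices) for one pass over the batch.
    envinds = np.arange(nenvs)
    flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
    rng.shuffle(envinds)
    for start in range(0, nenvs, envsperbatch):
        mbenvinds = envinds[start:start + envsperbatch]
        yield mbenvinds, flatinds[mbenvinds].ravel()

# Each (mbenvinds, mbflatinds) pair indexes states[mbenvinds] and the flattened rollout arrays
# (obs, returns, masks, actions, values, neglogpacs) exactly as in the training loops above.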