def build_env(args): ncpu = multiprocessing.cpu_count() if sys.platform == 'darwin': ncpu //= 2 nenv = args.num_env or ncpu alg = args.alg seed = args.seed env_type, env_id = get_env_type(args) if env_type in {'atari', 'retro'}: if alg == 'deepq': env = make_env(env_id, env_type, seed=seed, wrapper_kwargs={'frame_stack': True}) elif alg == 'trpo_mpi': env = make_env(env_id, env_type, seed=seed) else: frame_stack_size = 4 env = make_vec_env(env_id, env_type, nenv, seed, gamestate=args.gamestate, reward_scale=args.reward_scale) env = VecFrameStack(env, frame_stack_size) else: config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) config.gpu_options.allow_growth = True get_session(config=config) flatten_dict_observations = alg not in {'her'} env = make_vec_env(env_id, env_type, args.num_env or 1, seed, reward_scale=args.reward_scale, flatten_dict_observations=flatten_dict_observations) if env_type == 'mujoco': env = VecNormalize(env) return env
def profile_tf_runningmeanstd(): import time from baselines.common import tf_util tf_util.get_session( config=tf.ConfigProto( inter_op_parallelism_threads=1, intra_op_parallelism_threads=1, allow_soft_placement=True )) x = np.random.random((376,)) n_trials = 10000 rms = RunningMeanStd() tfrms = TfRunningMeanStd() tic1 = time.time() for _ in range(n_trials): rms.update(x) tic2 = time.time() for _ in range(n_trials): tfrms.update(x) tic3 = time.time() print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1)) print('tfrms update time ({} trials): {} s'.format(n_trials, tic3 - tic2)) tic1 = time.time() for _ in range(n_trials): z1 = rms.mean tic2 = time.time() for _ in range(n_trials): z2 = tfrms.mean assert z1 == z2 tic3 = time.time() print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1)) print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2)) '''
def add_all_summary(self, writer, values, iter): # Note that the order of the incoming ```values``` should be the same as the that of the # ```scalar_keys``` given in ```__init__``` if np.sum(np.isnan(values)+0) != 0: return sess = U.get_session() keys = self.scalar_summaries_ph + self.histogram_summaries_ph feed_dict = {} for k, v in zip(keys, values): feed_dict.update({k: v}) summaries_str = sess.run(self.summaries, feed_dict) writer.add_summary(summaries_str, iter)
def test_nonfreeze(): np.random.seed(0) tf.set_random_seed(0) a = tf.Variable(np.random.randn(3).astype('float32')) b = tf.Variable(np.random.randn(2,5).astype('float32')) loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) stepsize = 1e-2 # for some reason the session config with inter_op_parallelism_threads was causing # nested sess.run calls to freeze config = tf.ConfigProto(inter_op_parallelism_threads=1) sess = U.get_session(config=config) update_op = MpiAdamOptimizer(comm=MPI.COMM_WORLD, learning_rate=stepsize).minimize(loss) sess.run(tf.global_variables_initializer()) losslist_ref = [] for i in range(100): l,_ = sess.run([loss, update_op]) print(i, l) losslist_ref.append(l)
def __init__(self, epsilon=1e-4, shape=(), scope=''): sess = get_session() self._new_mean = tf.placeholder(shape=shape, dtype=tf.float64) self._new_var = tf.placeholder(shape=shape, dtype=tf.float64) self._new_count = tf.placeholder(shape=(), dtype=tf.float64) with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): self._mean = tf.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64) self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64) self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64) self.update_ops = tf.group([ self._var.assign(self._new_var), self._mean.assign(self._new_mean), self._count.assign(self._new_count) ]) sess.run(tf.variables_initializer([self._mean, self._var, self._count])) self.sess = sess self._set_mean_var_count()
def main(args): # configure logger, disable logging in child MPI processes (with rank > 0) arg_parser = common_arg_parser() args, unknown_args = arg_parser.parse_known_args(args) extra_args = parse_cmdline_kwargs(unknown_args) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: rank = 0 configure_logger(args.log_path) else: rank = MPI.COMM_WORLD.Get_rank() configure_logger(args.log_path, format_strs=[]) model, env = train(args, extra_args) if args.save_path is not None and rank == 0: save_path = osp.expanduser(args.save_path) model.save(save_path) if args.play: sess = get_session() model.load( '/home/yang/Documents/Work_Data/Research/rl/IEEE_ADPRL/ploted_fig/12veh/00100/00100checkpoint' ) import joblib, os, pickle import pandas as pd variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) ps = sess.run(variables) save_dict = {v.name: value for v, value in zip(variables, ps)} logger.log("Running trained model") mean = save_dict['ob_rms/mean:0'] var = save_dict['ob_rms/std:0'] env = env.venv obs_origin = env.reset() obs = np.clip((obs_origin - mean) / np.sqrt(var + 1e-8), -10., 10.) state = model.initial_state if hasattr(model, 'initial_state') else None dones = np.zeros((1, )) episode_rew = 0 obscollect = [] rewcollect = [] actcollect = [] envcollect = [] while True: if state is not None: actions, _, state, _ = model.step(obs, S=state, M=dones) else: actions, _, _, _ = model.step(obs) obs_origin, rew, done, _ = env.step(actions) obs = np.clip((obs_origin - mean) / np.sqrt(var + 1e-8), -10., 10.) episode_rew += rew[0] if isinstance(env, VecEnv) else rew env.render() uenv = env.envs[0].env.env.env env_temp = [ uenv.N, uenv.width, uenv.height, uenv.laneWidth, [[ uenv.vehs[i].endFlag, uenv.vehs[i].boundX, uenv.vehs[i].boundY, uenv.vehs[i].safeX, uenv.vehs[i].safeY, uenv.vehs[i].middlePoint[0], uenv.vehs[i].middlePoint[1] ] for i in range(uenv.N)] ] done = done.any() if isinstance(done, np.ndarray) else done if done: print('episode_rew={}'.format(episode_rew)) episode_rew = 0 obs = env.reset() break envcollect.append(env_temp) obscollect.append(obs_origin[0].tolist()) rewcollect.append(rew[0].tolist()) actcollect.append(actions[0].tolist()) with open('envlist.pkl', 'wb') as f: pickle.dump(envcollect, f) obsfile = pd.DataFrame(data=obscollect) obsfile.to_csv('obsfile.csv', encoding='gbk') rewfile = pd.DataFrame(data=rewcollect) rewfile.to_csv('rewfile.csv', encoding='gbk') actfile = pd.DataFrame(data=actcollect) actfile.to_csv('actfile.csv', encoding='gbk') # with open("obs.csv", "w", newline='') as obsfile: # writer1 = csv.writer(obsfile) # writer1.writerows(obscollect) # for row in obscollect: # print(type(row)) # writer1.writerows(row) # with open("rew.csv", "wb", newline='') as rewfile: # writer2 = csv.writer(rewfile) # for row in rewcollect: # writer2.writerows(row) # with open("act.csv", "wb", newline='') as actfile: # writer3 = csv.writer(actfile) # for row in actcollect: # writer3.writerows(row) env.close() return model
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training train_model = policy(nbatch_train, nsteps, sess) # CREATE THE PLACEHOLDERS A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) # Keep track of old actor OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) # Cliprange CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value # Get the value predicted vpred = train_model.vf # Clip the value = Oldvalue + clip(value - oldvalue, min = - cliprange, max = cliprange) vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) # Value loss 0.5 * SUM [max(unclipped, clipped) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Remember we want ratio (pi current policy / pi old policy) # But neglopac returns us -log(policy) # So we want to transform it into ratio # e^(-log old - (-log new)) == e^(log new - log old) == e^(log(new / old)) # = new/old (since exponential function cancels log) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Remember also that we're doing gradient ascent, aka we want to MAXIMIZE the objective function which is equivalent to say # Loss = - J # To make objective function negative we can put a negation on the multiplication (pi new / pi old) * - Advantages pg_losses = -ADV * ratio # value, min [1 - e] , max [1 + e] pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss # Why maximum, because pg_loss_unclipped and pg_loss_clipped are negative, getting the min of positive elements = getting # the max of negative elements pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss (Remember that L = - J because it's the same thing than max J loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 4. Backpropagation _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # Returns = R + yV(s') advs = returns - values # Normalize the advantages advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map )[:-1] self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, global_variables) #pylint: disable=E1101
def learn( *, network, env, eval_env, make_eval_env, env_id, total_timesteps, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, sil_update=10, sil_value=0.01, sil_alpha=0.6, sil_beta=0.1, sil_loss=0.1, # MBL # For train mbl mbl_train_freq=5, # For eval num_eval_episodes=5, eval_freq=5, vis_eval=False, eval_targs=('mbmf', ), #eval_targs=('mf',), quant=2, # For mbl.step #num_samples=(1500,), num_samples=(1, ), horizon=(2, ), #horizon=(2,1), #num_elites=(10,), num_elites=(1, ), mbl_lamb=(1.0, ), mbl_gamma=0.99, #mbl_sh=1, # Number of step for stochastic sampling mbl_sh=10000, #vf_lookahead=-1, #use_max_vf=False, reset_per_step=(0, ), # For get_model num_fc=2, num_fwd_hidden=500, use_layer_norm=False, # For MBL num_warm_start=int(1e4), init_epochs=10, update_epochs=5, batch_size=512, update_with_validation=False, use_mean_elites=1, use_ent_adjust=0, adj_std_scale=0.5, # For data loading validation_set_path=None, # For data collect collect_val_data=False, # For traj collect traj_collect='mf', # For profile measure_time=True, eval_val_err=False, measure_rew=True, save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. ''' if not isinstance(num_samples, tuple): num_samples = (num_samples, ) if not isinstance(horizon, tuple): horizon = (horizon, ) if not isinstance(num_elites, tuple): num_elites = (num_elites, ) if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, ) if not isinstance(reset_per_step, tuple): reset_per_step = (reset_per_step, ) if validation_set_path is None: if collect_val_data: validation_set_path = os.path.join(logger.get_dir(), 'val.pkl') else: validation_set_path = os.path.join('dataset', '{}-val.pkl'.format(env_id)) if eval_val_err: eval_val_err_path = os.path.join('dataset', '{}-combine-val.pkl'.format(env_id)) logger.log(locals()) logger.log('MBL_SH', mbl_sh) logger.log('Traj_collect', traj_collect) if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) np.set_printoptions(precision=3) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) # Instantiate the model object (that creates act_model and train_model) if model_fn is None: model_fn = Model make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, sil_update=sil_update, fn_reward=None, fn_obs=None, sil_value=sil_value, sil_alpha=sil_alpha, sil_beta=sil_beta, sil_loss=sil_loss, comm=comm, mpi_rank_weight=mpi_rank_weight, ppo=True, prev_pi=None) model = make_model() pi = model.sil_model if load_path is not None: model.load(load_path) # MBL # --------------------------------------- #viz = Visdom(env=env_id) win = None eval_targs = list(eval_targs) logger.log(eval_targs) make_model_f = get_make_mlp_model(num_fc=num_fc, num_fwd_hidden=num_fwd_hidden, layer_norm=use_layer_norm) mbl = MBL(env=eval_env, env_id=env_id, make_model=make_model_f, num_warm_start=num_warm_start, init_epochs=init_epochs, update_epochs=update_epochs, batch_size=batch_size, **network_kwargs) val_dataset = {'ob': None, 'ac': None, 'ob_next': None} if update_with_validation: logger.log('Update with validation') val_dataset = load_val_data(validation_set_path) if eval_val_err: logger.log('Log val error') eval_val_dataset = load_val_data(eval_val_err_path) if collect_val_data: logger.log('Collect validation data') val_dataset_collect = [] def _mf_pi(ob, t=None): stochastic = True ac, vpred, _, _ = pi.step(ob, stochastic=stochastic) return ac, vpred def _mf_det_pi(ob, t=None): #ac, vpred, _, _ = pi.step(ob, stochastic=False) ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob) return ac, vpred def _mf_ent_pi(ob, t=None): mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob) ac = np.random.normal(mean, std * adj_std_scale, size=mean.shape) return ac, vpred ################### use_ent_adjust======> adj_std_scale????????pi action sample def _mbmf_inner_pi(ob, t=0): if use_ent_adjust: return _mf_ent_pi(ob) else: #return _mf_pi(ob) if t < mbl_sh: return _mf_pi(ob) else: return _mf_det_pi(ob) # --------------------------------------- # Run multiple configuration once all_eval_descs = [] def make_mbmf_pi(n, h, e, l): def _mbmf_pi(ob): ac, rew = mbl.step(ob=ob, pi=_mbmf_inner_pi, horizon=h, num_samples=n, num_elites=e, gamma=mbl_gamma, lamb=l, use_mean_elites=use_mean_elites) return ac[None], rew return Policy(step=_mbmf_pi, reset=None) for n in num_samples: for h in horizon: for l in mbl_lamb: for e in num_elites: if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew', 'MBL_PPO_SIL', make_mbmf_pi(n, h, e, l))) #if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), 'MBL_TRPO-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), make_mbmf_pi(n, h, e, l))) if 'mf' in eval_targs: all_eval_descs.append( ('MeanRew', 'PPO_SIL', Policy(step=_mf_pi, reset=None))) logger.log('List of evaluation targets') for it in all_eval_descs: logger.log(it[0]) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield pool = Pool(mp.cpu_count()) warm_start_done = False U.initialize() if load_path is not None: pi.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) epinfobuf = deque(maxlen=40) if init_fn is not None: init_fn() if traj_collect == 'mf': obs = runner.run()[0] # Start total timer tfirststart = time.perf_counter() nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer if hasattr(model.train_model, "ret_rms"): model.train_model.ret_rms.update(returns) if hasattr(model.train_model, "rms"): model.train_model.rms.update(obs) tstart = time.perf_counter() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...') # Get minibatch obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 # Val data collection if collect_val_data: for ob_, ac_, ob_next_ in zip(obs[:-1, 0, ...], actions[:-1, ...], obs[1:, 0, ...]): val_dataset_collect.append( (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_))) # ----------------------------- # MBL update else: ob_mbl, ac_mbl = obs.copy(), actions.copy() mbl.add_data_batch(ob_mbl[:-1, ...], ac_mbl[:-1, ...], ob_mbl[1:, ...]) mbl.update_forward_dynamic(require_update=(update - 1) % mbl_train_freq == 0, ob_val=val_dataset['ob'], ac_val=val_dataset['ac'], ob_next_val=val_dataset['ob_next']) # ----------------------------- if update % log_interval == 0 and is_mpi_root: logger.info('Done.') epinfobuf.extend(epinfos) # Here what we're going to do is for each minibatch calculate the loss and append it. mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) l_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train(lrnow) else: # recurrent version print("caole") assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.perf_counter() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update_fn is not None: update_fn(update) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("misc/serial_timesteps", update * nsteps) logger.logkv("misc/nupdates", update) logger.logkv("misc/total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("misc/explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv("AverageReturn", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('misc/time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv('loss/' + lossname, lossval) if sil_update > 0: logger.logkv("sil_samples", sil_samples) if rank == 0: # MBL evaluation if not collect_val_data: #set_global_seeds(seed) default_sess = tf.get_default_session() def multithread_eval_policy(env_, pi_, num_episodes_, vis_eval_, seed): with default_sess.as_default(): if hasattr(env, 'ob_rms') and hasattr( env_, 'ob_rms'): env_.ob_rms = env.ob_rms res = eval_policy(env_, pi_, num_episodes_, vis_eval_, seed, measure_time, measure_rew) try: env_.close() except: pass return res if mbl.is_warm_start_done() and update % eval_freq == 0: warm_start_done = mbl.is_warm_start_done() if num_eval_episodes > 0: targs_names = {} with timed('eval'): num_descs = len(all_eval_descs) list_field_names = [ e[0] for e in all_eval_descs ] list_legend_names = [ e[1] for e in all_eval_descs ] list_pis = [e[2] for e in all_eval_descs] list_eval_envs = [ make_eval_env() for _ in range(num_descs) ] list_seed = [seed for _ in range(num_descs)] list_num_eval_episodes = [ num_eval_episodes for _ in range(num_descs) ] print(list_field_names) print(list_legend_names) list_vis_eval = [ vis_eval for _ in range(num_descs) ] for i in range(num_descs): field_name, legend_name = list_field_names[ i], list_legend_names[i], res = multithread_eval_policy( list_eval_envs[i], list_pis[i], list_num_eval_episodes[i], list_vis_eval[i], seed) #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed)) #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results): perf, elapsed_time, eval_rew = res logger.logkv(field_name, perf) if measure_time: logger.logkv('Time-%s' % (field_name), elapsed_time) if measure_rew: logger.logkv( 'SimRew-%s' % (field_name), eval_rew) targs_names[field_name] = legend_name if eval_val_err: fwd_dynamics_err = mbl.eval_forward_dynamic( obs=eval_val_dataset['ob'], acs=eval_val_dataset['ac'], obs_next=eval_val_dataset['ob_next']) logger.logkv('FwdValError', fwd_dynamics_err) #logger.dump_tabular() logger.dumpkvs() #print(logger.get_dir()) #print(targs_names) #if num_eval_episodes > 0: # win = plot(viz, win, logger.get_dir(), targs_names=targs_names, quant=quant, opt='best') #else: logger.dumpkvs() # ----------- yield pi if collect_val_data: with open(validation_set_path, 'wb') as f: pickle.dump(val_dataset_collect, f) logger.log('Save {} validation data'.format( len(val_dataset_collect))) if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root: checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) return model
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta): if USING_TPUS: tpu_grpc_url = TPUClusterResolver( tpu=[os.environ['TPU_NAME']]).get_master() sess = tf.Session(tpu_grpc_url) sess.run(tpu.initialize_system()) else: sess = get_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) # actions D = tf.placeholder(tf.float32, [nbatch]) # dones R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs, ) + ob_space.shape) train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs * (nsteps + 1), ) + ob_space.shape) with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE): step_model = policy(observ_placeholder=step_ob_placeholder, sess=sess) train_model = policy(observ_placeholder=train_ob_placeholder, sess=sess) params = find_trainable_variables("acer_model") print("Params {}".format(len(params))) for var in params: print(var) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) print(v.name) return v with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True): polyak_model = policy(observ_placeholder=train_ob_placeholder, sess=sess) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i # action probability distributions according to train_model, polyak_model and step_model # poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax train_model_p = tf.nn.softmax(train_model.pi) polyak_model_p = tf.nn.softmax(polyak_model.pi) step_model_p = tf.nn.softmax(step_model.pi) v = tf.reduce_sum(train_model_p * train_model.q, axis=-1) # shape is [nenvs * (nsteps + 1)] # strip off last step f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, A) q_i = get_by_index(q, A) # Compute ratios for importance truncation rho = f / (MU + eps) rho_i = get_by_index(rho, A) # Calculate Q_retrace targets qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling adv = qret - v logf = tf.log(f_i + eps) gain_f = logf * tf.stop_gradient( adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]) ) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2) gain_bc = tf.reduce_sum( logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1) #IMP: This is sum, as expectation wrt f loss_bc = -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]] * 2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = -f_pol / ( f + eps ) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g / ( nenvs * nsteps ) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) grads = [ gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params) ] avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging run_ops = [ _train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads ] names_ops = [ 'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if trust_region: run_ops = run_ops + [ norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] names_ops = names_ops + [ 'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] def train(obs, actions, rewards, dones, mus, states, masks, steps): cur_lr = lr.value_steps(steps) td_map = { train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks td_map[polyak_model.S] = states td_map[polyak_model.M] = masks if USING_TPUS: return names_ops, sess.run(tpu.rewrite( run_ops, td_map))[1:] # strip off _train else: return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train def _step(observation, **kwargs): return step_model._evaluate( [step_model.action, step_model_p, step_model.state], observation, **kwargs) self.train = train self.save = functools.partial(save_variables, sess=sess, variables=params) self.train_model = train_model self.step_model = step_model self._step = _step self.step = self.step_model.step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear', is_async=True): self.sess = sess = get_session() nbatch = nenvs * nsteps with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE): self.model = step_model = policy(nenvs, 1, sess=sess) self.model2 = train_model = policy(nenvs * nsteps, nsteps, sess=sess) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) self.logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV * neglogpac) entropy = tf.reduce_mean(train_model.pd.entropy()) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac) sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) self.vf_fisher = vf_fisher_loss = - vf_fisher_coef * tf.reduce_mean( tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params = params = find_trainable_variables("acktr_model") self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip, \ momentum=0.9, kfac_update=1, epsilon=0.01, \ stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm) # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, PG_LR: cur_lr, VF_LR: cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map ) return policy_loss, value_loss, policy_entropy self.train = train self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
from baselines.ppo2 import ppo2 from baselines.common.vec_env import VecFrameStack, VecNormalize, VecEnv, ShmemVecEnv, SubprocVecEnv, DummyVecEnv from baselines.common.vec_env.vec_video_recorder import VecVideoRecorder from baselines.common.cmd_util import common_arg_parser, parse_unknown_args, make_vec_env, make_env, make_mujoco_env from baselines.common.tf_util import get_session try: from mpi4py import MPI except ImportError: MPI = None config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) config.gpu_options.allow_growth = True get_session(config=config) def get_alg_module(alg, submodule=None): submodule = submodule or alg try: # first try to import the alg module from baselines alg_module = import_module('.'.join(['baselines', alg, submodule])) except ImportError: # then from rl_algs alg_module = import_module('.'.join(['rl_' + 'algs', alg, submodule])) return alg_module def get_learn_function(alg, submodule=None):
def build_env(args): ncpu = multiprocessing.cpu_count() if sys.platform == 'darwin': ncpu //= 2 nenv = args.num_env or ncpu alg = args.alg rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 seed = args.seed env_type, env_id = get_env_type(args.env) if env_type == 'mujoco': get_session( tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)) if args.num_env: env = SubprocVecEnv([ lambda: make_mujoco_env(env_id, seed + i if seed is not None else None, args.reward_scale) for i in range(args.num_env) ]) else: env = DummyVecEnv( [lambda: make_mujoco_env(env_id, seed, args.reward_scale)]) env = VecNormalize(env) elif env_type == 'atari': if alg == 'acer': env = make_atari_env(env_id, nenv, seed) elif alg == 'deepq': env = atari_wrappers.make_atari(env_id) env.seed(seed) env = bench.Monitor(env, logger.get_dir()) env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True) elif alg == 'trpo_mpi': env = atari_wrappers.make_atari(env_id) env.seed(seed) env = bench.Monitor( env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) env = atari_wrappers.wrap_deepmind(env) # TODO check if the second seeding is necessary, and eventually remove env.seed(seed) else: frame_stack_size = 4 env = VecFrameStack(make_atari_env(env_id, nenv, seed), frame_stack_size) elif env_type == 'retro': import retro gamestate = args.gamestate or 'Level1-1' env = retro_wrappers.make_retro( game=args.env, state=gamestate, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE) env.seed(args.seed) env = bench.Monitor(env, logger.get_dir()) env = retro_wrappers.wrap_deepmind_retro(env) elif env_type == 'classic_control': def make_env(): e = gym.make(env_id) e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True) e.seed(seed) return e env = DummyVecEnv([make_env]) else: raise ValueError('Unknown env_type {}'.format(env_type)) return env
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): act_model = policy(nbatch_act, 1, sess) train_model = policy(nbatch_train, nsteps, sess) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = tf.trainable_variables('ppo2_model') trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): advs = returns - values advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1] self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) if MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, global_variables) #pylint: disable=E1101
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) init_policy_params=None): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses) U.initialize() if init_policy_params is not None: cur_scope = pi.get_variables()[0].name[0:pi.get_variables()[0].name. find('/')] orig_scope = list(init_policy_params.keys() )[0][0:list(init_policy_params.keys())[0].find('/')] for i in range(len(pi.get_variables())): assign_op = pi.get_variables()[i].assign( init_policy_params[pi.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) U.get_session().run(assign_op) assign_op = oldpi.get_variables()[i].assign( init_policy_params[pi.get_variables()[i].name.replace( cur_scope, orig_scope, 1)]) U.get_session().run(assign_op) adam.sync() # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) logger.log(fmt_row(13, np.mean(losses, axis=0))) logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular()
def learn( env, seed=None, total_timesteps=1e6, nb_epochs=None, # with default settings, perform 1M steps total nb_rollout_steps=100, max_ep_len=250, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, start_steps=10000, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, nb_log_steps=None, nb_save_steps=None, batch_size=64, # per MPI worker polyak=0.01, action_range=(-250.0, 250.0), observation_range=(-5.0, 5.0), target_noise=0.2, noise_clip=0.5, policy_delay=2, eval_env=None, load_path=None, save_dir=None, **network_kwargs): set_global_seeds(seed) if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 memory = Memory(limit=int(1e6)) network_spec = [{ 'layer_type': 'dense', 'units': int(256), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int(128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int(1), 'activation': 'tanh', 'nodes_in': ['main'], 'nodes_out': ['main'] }] vnetwork_spec = [{ 'layer_type': 'concat', 'nodes_in': ['action_movement', 'observation_self'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int(256), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int(128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int(1), 'activation': '', 'nodes_in': ['main'], 'nodes_out': ['main'] }] network = Td3Policy(scope="td3", ob_space=env.observation_space, ac_space=env.action_space, network_spec=network_spec, v_network_spec=vnetwork_spec, stochastic=False, reuse=False, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns, observation_range=observation_range, action_range=action_range, target_noise=target_noise, noise_clip=noise_clip) target_network = Td3Policy(scope="target", ob_space=env.observation_space, ac_space=env.action_space, network_spec=network_spec, v_network_spec=vnetwork_spec, stochastic=False, reuse=False, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=False, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns, observation_range=observation_range, action_range=action_range, target_noise=target_noise, noise_clip=noise_clip, isTarget=True) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec( initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: action_noise = dict() for k, v in env.action_space.spaces.items(): act_size = v.spaces[0].shape[-1] _, stddev = current_noise_type.split('_') action_noise[k] = NormalActionNoise(mu=np.zeros(act_size), sigma=float(stddev) * np.ones(act_size)) elif 'ou' in current_noise_type: action_noise = dict() for k, v in env.action_space.spaces.items(): act_size = v.spaces[0].shape[-1] _, stddev = current_noise_type.split('_') action_noise[k] = OrnsteinUhlenbeckActionNoise( mu=np.zeros(act_size), sigma=float(stddev) * np.ones(act_size)) else: raise RuntimeError( 'unknown noise type "{}"'.format(current_noise_type)) max_action = action_range[1] logger.info( 'scaling actions by {} before executing in env'.format(max_action)) agent = TD3(env, network, target_network, memory, env.action_space, env.observation_space, steps_per_epoch=nb_rollout_steps, epochs=nb_epochs, gamma=gamma, polyak=polyak, actor_lr=actor_lr, critic_lr=critic_lr, batch_size=batch_size, start_steps=start_steps, action_noise=action_noise, target_noise=target_noise, noise_clip=noise_clip, policy_delay=policy_delay) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() saver = functools.partial(save_variables, sess=sess) loader = functools.partial(load_variables, sess=sess) if load_path != None: loader(load_path) # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = env.num_envs n_agents = obs['observation_self'].shape[0] episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32) #vector episode_step = np.zeros(nenvs, dtype=int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for t in range(int(total_timesteps)): """ Until start_steps have elapsed, randomly sample actions from a uniform distribution for better exploration. Afterwards, use the learned policy (with some noise, via act_noise). """ if t > start_steps: action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) nenvs_actions = [] for i in range(nenvs): nenv_action = { 'action_movement': action['action_movement'][i * n_agents:(i + 1) * n_agents] } nenvs_actions.append(nenv_action) else: action, q = env.action_space.sample(), None nenvs_actions = [] for i in range(nenvs): nenv_action = { 'action_movement': action['action_movement'][i * n_agents:(i + 1) * n_agents][0] } nenvs_actions.append(nenv_action) new_obs, r, done, info = env.step(nenvs_actions) episode_reward += r episode_step += 1 for d in range(len(done)): done[d] = False if episode_step == max_ep_len else done[d] epoch_actions.append(action) epoch_qs.append(q) agent.store_transition( obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() episode_actor_losses = [] episode_critic_losses = [] episode_critic = [] episode_critic_twin = [] if d or (episode_step[0] == max_ep_len): """ Perform all TD3 updates at the end of the trajectory (in accordance with source code of TD3 published by original authors). """ for j in range(episode_step[0]): critic_loss, critic, critic_twin, actor_loss = agent.train( episode_step[0]) episode_critic_losses.append(critic_loss) episode_critic.append(critic) episode_critic_twin.append(critic_twin) if actor_loss is not None: episode_actor_losses.append(actor_loss) obs, r, done, episode_reward, episode_step = env.reset( ), 0, False, np.zeros((nenvs, n_agents), dtype=np.float32), np.zeros(nenvs, dtype=int) if (t + 1) % nb_log_steps == 0: # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_std'] = np.std( epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean( episode_rewards_history) combined_stats['rollout/return_history_std'] = np.std( episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean( epoch_episode_steps) combined_stats['train/loss_actor'] = np.mean(episode_actor_losses) combined_stats['train/loss_critic'] = np.mean( episode_critic_losses) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float( duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s' % x) combined_stats_sums = np.array( [np.array(x).flatten()[0] for x in combined_stats.values()]) # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) if nb_save_steps != None and (t + 1) % nb_save_steps == 0: if save_dir == None: checkdir = osp.join(logger.get_dir(), 'checkpoints') else: checkdir = osp.join(save_dir, 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % t) print('Saving to', savepath) saver(savepath) return agent
# for seed in [1, 2, 3]: for vn in vae_names: vae_params = { 'init_from': vn.replace('/', ':'), 'k': k, 'latent_dim': 5, 'with_attrs': True, } args = [ '--env', 'PendulumVisual-v0', '--num_timesteps', '10e6', '--alg', 'ppo2', '--network', 'mlp', '--log_interval', '2', '--nminibatches', '32', '--noptepochs', '10', '--num_env', '16', '--seed', str(0), '--tensorboard', 'True', '--k', str(k), '--target_kl', '0.01', '--early_stop', 'True', '--plot_thetas', 'True', ] main(args, build_fn=build_pend_env, vae_params=vae_params) s = get_session() s.close() tf.reset_default_graph() print('done')
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None): self.sess = sess = get_session() # add a summary writer - Daniel logdir = logger.get_dir() self.writer = tf.summary.FileWriter(logdir, sess.graph) with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.pretrain_loss = tf.losses.mean_squared_error( train_model.pi, self.A) self.pretrain_op = self.trainer.minimize(self.pretrain_loss, None, params) self.summaries = tf.summary.merge_all() self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101 self.writer.add_graph(sess.graph) self.writer.close()
def learn(env, network, seed=None, lr=5e-4, total_timesteps=1000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, save_path=None, **network_kwargs): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. batch_size: int size of a batch sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. """ logger = logging.getLogger() coloredlogs.install( level='DEBUG', fmt= '%(asctime)s,%(msecs)03d %(filename)s[%(process)d] %(levelname)s %(message)s' ) logger.setLevel(logging.DEBUG) # DATAVAULT: Set up list of action meanings and two lists to store episode # and total sums for each possible action in the list. action_names = env.unwrapped.get_action_meanings() action_episode_sums = [] action_total_sums = [] for x in range(len(action_names)): action_episode_sums.append(0) action_total_sums.append(0) # And obviously, you need a datavault item dv = DataVault() # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) #DATAVAULT: This is where you usually want to scrape data - in the timestep loop for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True # if environment is pacman, limit moves to four directions name = env.unwrapped.spec.id if name == "MsPacmanNoFrameskip-v4": while True: step_return = act(np.array(obs)[None], update_eps=update_eps, **kwargs) action = step_return[0][0] env_action = action q_values = np.squeeze(step_return[1]) # test for break condition if 1 <= action <= 4: break else: step_return = act(np.array(obs)[None], update_eps=update_eps, **kwargs) action = step_return[0][0] q_values = np.squeeze(step_return[1]) env_action = action reset = False new_obs, rew, done, info = env.step(env_action) # DATAVAULT: after each step, we push the information out to the datavault lives = env.ale.lives() #store_data(self, action, action_name, action_episode_sums, action_total_sums, reward, done, info, lives, q_values, observation, mean_reward): action_episode_sums, action_total_sums = dv.store_data( action, action_names[action], action_episode_sums, action_total_sums, rew, done, info, lives, q_values, new_obs, saved_mean_reward) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() if (len(episode_rewards[-101:-1]) > 0): mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) else: mean_100ep_reward = 0 num_episodes = len(episode_rewards) if done and print_freq is not None and len( episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log( "Saving model due to mean reward increase: {} -> {}" .format(saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format( saved_mean_reward)) load_variables(model_file) dv.make_dataframes() print("Save path is: ") print(save_path) # use parent dir to save data, so we can keep the current folder small and portable directory = os.path.abspath(os.path.join(save_path, os.pardir)) csv_path = os.path.join(directory, 'CSVs') os.mkdir(csv_path) dv.df_to_csv(csv_path) return act
def learn(env, policy_fn, *, timesteps_per_actorbatch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) gae_kstep=None, env_eval=None, saved_model=None, eval_at=50, save_at=50, normalize_atarg=True, experiment_spec=None, # dict with: experiment_name, experiment_folder **extra_args ): # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_fn("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_fn("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule entromult = tf.placeholder(name='entromult', dtype=tf.float32, shape=[]) # entropy penalty multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon ob = U.get_placeholder_cached(name="ob") ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) pol_entpen = (-entcoeff) * meanent MPI_n_ranks = MPI.COMM_WORLD.Get_size() ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] var_list = pi.get_trainable_variables() lossandgrad = U.function([ob, ac, atarg, ret, lrmult, entromult], losses + [U.flatgrad(total_loss, var_list)]) adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) compute_losses = U.function([ob, ac, atarg, ret, lrmult, entromult], losses) acts = list() stats1 = list() stats2 = list() stats3 = list() for p in range(ac_space.shape[0]): acts.append(tf.placeholder(tf.float32, name="act_{}".format(p + 1))) stats1.append(tf.placeholder(tf.float32, name="stats1_{}".format(p + 1))) stats2.append(tf.placeholder(tf.float32, name="stats2_{}".format(p + 1))) stats3.append(tf.placeholder(tf.float32, name="stats3_{}".format(p + 1))) tf.summary.histogram("act_{}".format(p), acts[p]) if pi.dist == 'gaussian': tf.summary.histogram("pd_mean_{}".format(p), stats1[p]) tf.summary.histogram("pd_std_{}".format(p), stats2[p]) tf.summary.histogram("pd_logstd_{}".format(p), stats3[p]) else: tf.summary.histogram("pd_beta_{}".format(p), stats1[p]) tf.summary.histogram("pd_alpha_{}".format(p), stats2[p]) tf.summary.histogram("pd_alpha_beta_{}".format(p), stats3[p]) rew = tf.placeholder(tf.float32, name="rew") tf.summary.histogram("rew", rew) summaries = tf.summary.merge_all() gather_summaries = U.function([ob, *acts, *stats1, *stats2, *stats3, rew], summaries) U.initialize() adam.sync() if saved_model is not None: U.load_state(saved_model) if (MPI.COMM_WORLD.Get_rank() == 0) & (experiment_spec is not None): # TensorBoard & Saver # ---------------------------------------- if experiment_spec['experiment_folder'] is not None: path_tb = os.path.join(experiment_spec['experiment_folder'], 'tensorboard') path_logs = os.path.join(experiment_spec['experiment_folder'], 'logs') exp_name = '' if experiment_spec['experiment_name'] is not None else experiment_spec['experiment_name'] summary_file = tf.summary.FileWriter(os.path.join(path_tb, exp_name), U.get_session().graph) saver = tf.train.Saver(max_to_keep=None) logger.configure(dir=os.path.join(path_logs, exp_name)) else: logger.configure(format_strs=[]) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(iters_so_far) / max_iters, 0) elif 'exp' in schedule: current_lr = schedule.strip() _, d = current_lr.split('__') cur_lrmult = float(d) ** (float(iters_so_far) / max_iters) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() if gae_kstep is None: add_vtarg_and_adv(seg, gamma, lam) T = len(seg["rew"]) else: calculate_advantage_and_vtarg(seg, gamma, lam, k_step=gae_kstep) T = len(seg["rew"]) - gae_kstep ob, ac, atarg, tdlamret = seg["ob"][:T], seg["ac"][:T], seg["adv"][:T], seg["tdlamret"][:T] vpredbefore = seg["vpred"][:T] # predicted value function before udpate if normalize_atarg: eps = 1e-9 atarg = (atarg - atarg.mean()) / (atarg.std() + eps) # standardized advantage function estimate d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values logger.log("Optimizing...") logger.log(fmt_row(13, loss_names)) # Here we do a bunch of optimization epochs over the data g_max = -np.Inf g_min = np.Inf g_mean = [] for _ in range(optim_epochs): losses = [] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, cur_lrmult) g_max = g.max() if g.max() > g_max else g_max g_min = g.min() if g.min() < g_min else g_min g_mean.append(g.mean()) if np.isnan(np.sum(g)): print('NaN in Gradient, skipping this update') continue adam.update(g, optim_stepsize * cur_lrmult) losses.append(newlosses) # logger.log(fmt_row(13, np.mean(losses, axis=0))) summary = tf.Summary() logger.log("Evaluating losses...") losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, cur_lrmult) losses.append(newlosses) meanlosses, _, _ = mpi_moments(losses, axis=0) if MPI.COMM_WORLD.Get_rank() == 0: logger.log(fmt_row(13, meanlosses)) for (lossval, name) in zipsame(meanlosses, loss_names): logger.record_tabular("loss_" + name, lossval) summary.value.add(tag="loss_" + name, simple_value=lossval) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("ItersSoFar (%)", iters_so_far / max_iters * 100) logger.record_tabular("TimeElapsed", time.time() - tstart) logger.record_tabular("TimePerIter", (time.time() - tstart) / (iters_so_far + 1)) if MPI.COMM_WORLD.Get_rank() == 0: # Saves model if ((iters_so_far % save_at) == 0) & (iters_so_far != 0): if experiment_spec['experiment_folder'] is not None: path_models = os.path.join(experiment_spec['experiment_folder'], 'models') dir_path = os.path.join(path_models, exp_name) if not os.path.exists(dir_path): os.makedirs(dir_path) saver.save(U.get_session(), os.path.join(dir_path, 'model'), global_step=iters_so_far) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() summ = gather_summaries(ob, *np.split(ac, ac_space.shape[0], axis=1), *np.split(seg['stat1'], ac_space.shape[0], axis=1), *np.split(seg['stat2'], ac_space.shape[0], axis=1), *np.split(seg['stat3'], ac_space.shape[0], axis=1), seg['rew']) summary.value.add(tag="total_loss", simple_value=meanlosses[:3].sum()) summary.value.add(tag="explained_variance", simple_value=explained_variance(vpredbefore, tdlamret)) summary.value.add(tag='EpRewMean', simple_value=np.mean(rewbuffer)) summary.value.add(tag='EpLenMean', simple_value=np.mean(lenbuffer)) summary.value.add(tag='EpThisIter', simple_value=len(lens)) summary.value.add(tag='atarg_max', simple_value=atarg.max()) summary.value.add(tag='atarg_min', simple_value=atarg.min()) summary.value.add(tag='atarg_mean', simple_value=atarg.mean()) summary.value.add(tag='GMean', simple_value=np.mean(g_mean)) summary.value.add(tag='GMax', simple_value=g_max) summary.value.add(tag='GMin', simple_value=g_min) summary.value.add(tag='learning_rate', simple_value=cur_lrmult * optim_stepsize) summary.value.add(tag='AcMAX', simple_value=np.mean(seg["ac"].max())) summary.value.add(tag='AcMIN', simple_value=np.mean(seg["ac"].min())) summary_file.add_summary(summary, iters_so_far) summary_file.add_summary(summ, iters_so_far) iters_so_far += 1
def _serialize_variables(): sess = get_session() variables = tf.trainable_variables() values = sess.run(variables) return {var.name: value for var, value in zip(variables, values)}
def _create_double_network(self, reuse=False): # logger.info("Creating a q function ensemble with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # running averages, separate from alg (this is within a different scope) # assume reuse is False with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats'): if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) self.V_loss_tf = [None] * self.size_ensemble self.V_fun = [None] * self.size_ensemble self.V_target_fun = [None] * self.size_ensemble self.V_grads_vars_tf = [None] * self.size_ensemble self.V_grad_tf = [None] * self.size_ensemble self.V_adam = [None] * self.size_ensemble self.init_target_net_op = [None] * self.size_ensemble self.update_target_net_op = [None] * self.size_ensemble clip_range = (-self.clip_return, 0. if self.clip_pos_returns else self.clip_return) for e in range(self.size_ensemble): # mini-batch sampling batch = self.staging_tf[e].get() batch_tf = OrderedDict([ (key, batch[i]) for i, key in enumerate(self.stage_shapes.keys()) ]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) # networks (no target network for now) with tf.variable_scope(f've_{e}') as vs: if reuse: vs.reuse_variables() v_function = self.create_v_function(batch_tf, **self.__dict__) vs.reuse_variables() with tf.variable_scope(f've_{e}_target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] target_batch_tf['u'] = batch_tf['u_2'] v_target_function = self.create_v_function( target_batch_tf, **self.__dict__) vs.reuse_variables() # loss functions target_tf = tf.clip_by_value( batch_tf['r'] + self.gamma * v_target_function.V_tf, *clip_range) V_loss_tf = tf.reduce_mean( tf.square(tf.stop_gradient(target_tf) - v_function.V_tf)) V_scope = f've_{e}/V' V_grads_tf = tf.gradients(V_loss_tf, self._vars(V_scope)) assert len(self._vars(V_scope)) == len(V_grads_tf) V_grads_vars_tf = zip(V_grads_tf, self._vars(V_scope)) V_grad_tf = flatten_grads(grads=V_grads_tf, var_list=self._vars(V_scope)) # optimizers V_adam = MpiAdam(self._vars(V_scope), scale_grad_by_procs=False) # store in attribute lists self.V_loss_tf[e] = V_loss_tf self.V_fun[e] = v_function self.V_target_fun[e] = v_target_function self.V_grads_vars_tf[e] = V_grads_vars_tf self.V_grad_tf[e] = V_grad_tf self.V_adam[e] = V_adam # polyak averaging main_vars = sum( [self._vars(f've_{e}/V') for e in range(self.size_ensemble)], []) target_vars = sum([ self._vars(f've_{e}_target/V') for e in range(self.size_ensemble) ], []) self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(target_vars, main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(target_vars, main_vars))) assert len(main_vars) == len(target_vars) # report loss as the average of value function loss over the ensemble # self.V_loss_tf = tf.reduce_mean(self.V_loss_tf) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net()
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None): self.sess = sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
def learn(*, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs ): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker )) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular() return pi
def update_popart(self, v_targets): old_mean, old_std = U.get_session().run( [self.v_rms.mean, self.v_rms.std]) self.v_rms.update(v_targets) self.renorm_v(old_std, old_mean)
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) #choose only the demo buffer samples mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis = 0) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) if self.bc_loss ==1 and self.q_filter == 1 : # train with demonstrations and use bc_loss and q_filter both maskMain = tf.reshape(tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1]) #where is the demonstrator action better than actor action according to the critic? choose those samples only #define the cloning loss on the actor's actions only on the samples which adhere to the above masks self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) #primary loss scaled by it's respective weight prm_loss_weight self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) #L2 loss on action values scaled by the same weight prm_loss_weight self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxilliary loss scaled by its weight aux_loss_weight elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf else: #If not training with demonstrations self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net()
def learn(network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, **network_kwargs): set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 nb_actions = env.action_space.shape[-1] assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype = np.float32) #vector episode_step = np.zeros(nenvs, dtype = int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append(eval_episode_reward[d]) eval_episode_reward[d] = 0.0 if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) return agent
def __init__(self, policy, env, nsteps, model_name="a2c_model", ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs * nsteps with tf.variable_scope(model_name, reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(None, 1, sess) # train_model is used to train our network train_model = policy(None, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, (None, )) R = tf.placeholder(tf.float32, (None, )) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss # vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) vf_loss = losses.mean_squared_error(train_model.vf, R) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables(model_name) # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, **network_kwargs ): """Train a deepq model. Parameters ------- env: gym.Env environment to train on q_func: (tf.Variable, int, str, bool) -> tf.Variable the model that takes the following inputs: observation_in: object the output of observation placeholder num_actions: int number of actions scope: str reuse: bool should be passed to outer variable scope and returns a tensor of shape (batch_size, num_actions) with values of every action. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. """ # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) load_variables(model_file) return act
def learn( env, test_env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint entcoeff=0.0, vf_coef=0.5, callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) save_interval=50, #load_path = "C:\\Users\\Yangang REN\\AppData\\Local\\Temp\\openai-2019-11-21-10-40-10-039590\\checkpoints\\00351" load_path=None): """ :param env: :param test_env: :param policy_func: :param timesteps_per_batch: :param clip_param: :param optim_epochs: :param optim_stepsize: :param optim_batchsize: :param gamma: :param lam: :param max_timesteps: :param max_episodes: :param max_iters: :param max_seconds: :param entcoeff: :param vf_coef: float value function loss coefficient in the optimization objective :param callback: :param adam_epsilon: :param schedule: :param save_interval: :param load_path: :return: """ assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" rew_mean = [] # get state and action space ob_space = env.observation_space pro_ac_space = env.action_space adv_ac_space = env.adv_action_space # Construct network for new policy pro_pi = policy_func("pro_pi", ob_space, pro_ac_space) pro_oldpi = policy_func("pro_oldpi", ob_space, pro_ac_space) adv_pi = policy_func("adv_pi", ob_space, adv_ac_space) adv_oldpi = policy_func("adv_oldpi", ob_space, adv_ac_space) pro_atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) adv_atarg = tf.placeholder(dtype=tf.float32, shape=[None]) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule # Annealed cliping parameter epislon clip_param = clip_param * lrmult ob = U.get_placeholder_cached(name="ob") pro_ac = pro_pi.pdtype.sample_placeholder([None]) adv_ac = adv_pi.pdtype.sample_placeholder([None]) pro_kloldnew = pro_oldpi.pd.kl(pro_pi.pd) # compute kl difference adv_kloldnew = adv_oldpi.pd.kl(adv_pi.pd) pro_ent = pro_pi.pd.entropy() adv_ent = adv_pi.pd.entropy() pro_meankl = tf.reduce_mean(pro_kloldnew) adv_meankl = tf.reduce_mean(adv_kloldnew) pro_meanent = tf.reduce_mean(pro_ent) adv_meanent = tf.reduce_mean(adv_ent) pro_pol_entpen = (-entcoeff) * pro_meanent adv_pol_entpen = (-entcoeff) * adv_meanent pro_ratio = tf.exp(pro_pi.pd.logp(pro_ac) - pro_oldpi.pd.logp(pro_ac)) adv_ratio = tf.exp(adv_pi.pd.logp(adv_ac) - adv_oldpi.pd.logp(adv_ac)) pro_surr1 = pro_ratio * pro_atarg # surrogate from conservative policy iteration adv_surr1 = adv_ratio * adv_atarg pro_surr2 = tf.clip_by_value(pro_ratio, 1.0 - clip_param, 1.0 + clip_param) * pro_atarg adv_surr2 = tf.clip_by_value(adv_ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_atarg # TODO:check this code carefully pro_pol_surr = -tf.reduce_mean(tf.minimum(pro_surr1, pro_surr2)) adv_pol_surr = tf.reduce_mean(tf.minimum(adv_surr1, adv_surr2)) pro_vf_loss = tf.reduce_mean(tf.square(pro_pi.vpred - ret)) adv_vf_loss = tf.reduce_mean(tf.square(adv_pi.vpred - ret)) # FIXME: do not forget cofficient between different loss pro_total_loss = pro_pol_surr + pro_pol_entpen + vf_coef * pro_vf_loss adv_total_loss = adv_pol_surr + adv_pol_entpen + vf_coef * adv_vf_loss pro_losses = [ pro_pol_surr, pro_pol_entpen, pro_vf_loss, pro_meankl, pro_meanent ] pro_loss_names = [ "pro_pol_surr", "pro_pol_entpen", "pro_vf_loss", "pro_kl", "pro_ent" ] adv_losses = [ adv_pol_surr, adv_pol_entpen, adv_vf_loss, adv_meankl, adv_meanent ] adv_loss_names = [ "adv_pol_surr", "adv_pol_entpen", "adv_vf_loss", "adv_kl", "adv_ent" ] pro_var_list = pro_pi.get_trainable_variables() adv_var_list = adv_pi.get_trainable_variables() pro_lossandgrad = U.function([ob, pro_ac, pro_atarg, ret, lrmult], pro_losses + [U.flatgrad(pro_total_loss, pro_var_list)]) adv_lossandgrad = U.function([ob, adv_ac, adv_atarg, ret, lrmult], adv_losses + [U.flatgrad(adv_total_loss, adv_var_list)]) pro_adam = MpiAdam(pro_var_list, epsilon=adam_epsilon) adv_adam = MpiAdam(adv_var_list, epsilon=adam_epsilon) pro_assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame( pro_oldpi.get_variables(), pro_pi.get_variables()) ]) adv_assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame( adv_oldpi.get_variables(), adv_pi.get_variables()) ]) # U.function(inputs, outputs) pro_compute_losses = U.function([ob, pro_ac, pro_atarg, ret, lrmult], pro_losses) adv_compute_losses = U.function([ob, adv_ac, adv_atarg, ret, lrmult], adv_losses) U.initialize() pro_adam.sync() adv_adam.sync() save = functools.partial(save_variables, sess=get_session()) load = functools.partial(load_variables, sess=get_session()) # TODO: load save the path if load_path is not None: load(load_path) print('Loading model and running it…') max_iters = 0 # Prepare for rollouts seg_gen = traj_segment_generator(pro_pi, adv_pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards # Begin to update the loss function for update in range(1, max_iters + 1): if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break # adjusting the learning rate if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = 1.0 - (update - 1.0) / max_iters else: raise NotImplementedError logger.log("********** Iteration %i ************" % (iters_so_far + 1)) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, pro_ac, adv_ac, pro_atarg, adv_atarg, pro_tdlamret, adv_tdlamret = seg[ "ob"], seg["pro_ac"], seg["adv_ac"], seg["pro_adv"], seg[ "adv_adv"], seg["pro_tdlamret"], seg["adv_tdlamret"] pro_vpredbefore = seg[ "pro_vpred"] # predicted value function before udpate adv_vpredbefore = seg["adv_vpred"] # standardized advantage function estimate pro_atarg = (pro_atarg - pro_atarg.mean()) / (pro_atarg.std() + 1e-8) adv_atarg = (adv_atarg - adv_atarg.mean()) / (adv_atarg.std() + 1e-8) # TODO d = Dataset(dict(ob=ob, ac=pro_ac, atarg=pro_atarg, vtarg=pro_tdlamret), shuffle=not pro_pi.recurrent) optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(pro_pi, "ob_rms"): pro_pi.ob_rms.update(ob) # update running mean/std for policy pro_assign_old_eq_new( ) # set old parameter values to new parameter values # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): pro_losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = pro_lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) pro_adam.update(g, optim_stepsize * cur_lrmult) pro_losses.append(newlosses) pro_losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = pro_compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) pro_losses.append(newlosses) pro_meanlosses, _, _ = mpi_moments(pro_losses, axis=0) # Training the adversary agent d = Dataset(dict(ob=ob, ac=adv_ac, atarg=adv_atarg, vtarg=adv_tdlamret), shuffle=not adv_pi.recurrent) if hasattr(adv_pi, "ob_rms"): adv_pi.ob_rms.update(ob) adv_assign_old_eq_new() # logger.log(fmt_row(13, adv_loss_names)) for _ in range(optim_epochs): adv_losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): *newlosses, g = adv_lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adv_adam.update(g, optim_stepsize * cur_lrmult) adv_losses.append(newlosses) adv_losses = [] for batch in d.iterate_once(optim_batchsize): newlosses = adv_compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult) adv_losses.append(newlosses) adv_meanlosses, _, _ = mpi_moments(adv_losses, axis=0) # print the results logger.logkv("pro_policy_vf", pro_meanlosses[2]) logger.logkv("adv_policy_vf", adv_meanlosses[2]) # test # curr_rew = evaluate(pro_pi, test_env) # rew_mean.append(curr_rew) # print(curr_rew) curr_rew = evaluate(pro_pi, adv_pi, test_env) rew_mean.append(curr_rew) logger.logkv("test reward", curr_rew) # logger.record_tabular("ev_tdlam_before", explained_variance(pro_vpredbefore, pro_tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.logkv('eprewmean', safemean(rewbuffer)) logger.logkv('eplenmean', safemean(lenbuffer)) logger.dumpkvs() if save_interval and (update == 1 or iters_so_far % save_interval == 0) and logger.get_dir(): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to…', savepath) save(savepath) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 # return np.array(rew_mean) return pro_pi, adv_pi
def learn(network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, load_path = None, save_path = '<specify/path>', **network_kwargs): print("Save PATH;{}".format(save_path)) print("Load PATH;{}".format(load_path)) set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 nb_actions = env.action_space.shape[-1] #assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) # Prepare everything. sess = U.get_session() agent.initialize(sess) checkpoint_num = 0 if load_path is not None: agent.load(load_path) checkpoint_num = int(os.path.split(load_path)[1]) + 1 sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype = np.float32) #vector episode_step = np.zeros(nenvs, dtype = int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 if load_path is None: os.makedirs(save_path, exist_ok=True) for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append(eval_episode_reward[d]) eval_episode_reward[d] = 0.0 if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) savepath = os.path.join(save_path, str(epoch+checkpoint_num)) print('Saving to ', savepath) agent.save(savepath) return agent
def learn(*, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters =3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, **network_kwargs ): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session(config=tf.ConfigProto( allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker )) policy = build_policy(env, network, value_network='copy', **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) start += sz gvp = tf.add_n([tf.reduce_sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi"))]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters>0, total_timesteps>0, max_episodes>0])==0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************"%iters_so_far) with timed("sampling"): seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new() # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) assert np.isfinite(stepdir).all() shs = .5*stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank==0: logger.dump_tabular() return pi
def learn( env, policy_func, *, timesteps_per_batch, # timesteps per actor per update clip_param, entcoeff, # clipping parameter epsilon, entropy coeff optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers gamma, lam, # advantage estimation max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint callback=None, # you can do anything in the callback, since it takes locals(), globals() adam_epsilon=1e-5, schedule='constant', # annealing for stepsize parameters (epsilon and adam) num_options=1, app='', saves=False, wsaves=False, epoch=-1, seed=1, dc=0): optim_batchsize_ideal = optim_batchsize np.random.seed(seed) tf.set_random_seed(seed) env._seed(seed) ### Book-keeping gamename = env.spec.id[:-3].lower() gamename += 'seed' + str(seed) gamename += app dirname = '{}_{}opts_saves/'.format(gamename, num_options) if wsaves: first = True if not os.path.exists(dirname): os.makedirs(dirname) first = False # while os.path.exists(dirname) and first: # dirname += '0' files = ['pposgd_simple.py', 'mlp_policy.py', 'run_mujoco.py'] for i in range(len(files)): src = os.path.expanduser('~/baselines/baselines/ppo1/') + files[i] dest = os.path.expanduser('~/baselines/baselines/ppo1/') + dirname shutil.copy2(src, dest) ### # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return # option = tf.placeholder(dtype=tf.int32, shape=[None]) lrmult = tf.placeholder( name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule clip_param = clip_param * lrmult # Annealed cliping parameter epislon # pdb.set_trace() ob = U.get_placeholder_cached(name="ob") option = U.get_placeholder_cached(name="option") term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None]) ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = U.mean(kloldnew) meanent = U.mean(ent) pol_entpen = (-entcoeff) * meanent ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold surr1 = ratio * atarg # surrogate from conservative policy iteration surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg # pol_surr = -U.mean(tf.minimum( surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP) vf_loss = U.mean(tf.square(pi.vpred - ret)) total_loss = pol_surr + pol_entpen + vf_loss losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent] loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"] term_loss = pi.tpred * term_adv log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0)) entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1) op_loss = -tf.reduce_sum(log_pi[0][option[0]] * atarg + entropy * 0.1) total_loss += op_loss var_list = pi.get_trainable_variables() term_list = var_list[6:8] lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option, term_adv], losses + [U.flatgrad(total_loss, var_list)]) termloss = U.function([ob, option, term_adv], [U.flatgrad(term_loss, var_list) ]) # Since we will use a different step size. adam = MpiAdam(var_list, epsilon=adam_epsilon) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables()) ]) compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses) U.initialize() adam.sync() saver = tf.train.Saver(max_to_keep=10000) ### More book-kepping results = [] if saves: results = open( gamename + '_' + str(num_options) + 'opts_' + '_results.csv', 'w') out = 'epoch,avg_reward' for opt in range(num_options): out += ',option {} dur'.format(opt) for opt in range(num_options): out += ',option {} std'.format(opt) for opt in range(num_options): out += ',option {} term'.format(opt) for opt in range(num_options): out += ',option {} adv'.format(opt) out += '\n' results.write(out) # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n') results.flush() if epoch >= 0: dirname = '{}_{}opts_saves/'.format(gamename, num_options) print("Loading weights from iteration: " + str(epoch)) filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch) saver.restore(U.get_session(), filename) ### episodes_so_far = 0 timesteps_so_far = 0 global iters_so_far iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards assert sum( [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]) == 1, "Only one time constraint permitted" # Prepare for rollouts # ---------------------------------------- seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, num_options=num_options, saves=saves, results=results, rewbuffer=rewbuffer, dc=dc) datas = [0 for _ in range(num_options)] while True: if callback: callback(locals(), globals()) if max_timesteps and timesteps_so_far >= max_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break elif max_seconds and time.time() - tstart >= max_seconds: break if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) else: raise NotImplementedError logger.log("********** Iteration %i ************" % iters_so_far) seg = seg_gen.__next__() add_vtarg_and_adv(seg, gamma, lam) opt_d = [] for i in range(num_options): dur = np.mean( seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0. opt_d.append(dur) std = [] for i in range(num_options): logstd = np.mean( seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0. std.append(np.exp(logstd)) print("mean opt dur:", opt_d) print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0)) print("mean term p:", np.mean(np.array(seg['term_p']), axis=0)) print("mean value val:", np.mean(np.array(seg['value_val']), axis=0)) ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[ "adv"], seg["tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy assign_old_eq_new() # set old parameter values to new parameter values if iters_so_far % 5 == 0 and wsaves: print("weights are saved...") filename = dirname + '{}_epoch_{}.ckpt'.format( gamename, iters_so_far) save_path = saver.save(U.get_session(), filename) min_batch = 160 # Arbitrary t_advs = [[] for _ in range(num_options)] for opt in range(num_options): indices = np.where(opts == opt)[0] print("batch size:", indices.size) opt_d[opt] = indices.size if not indices.size: t_advs[opt].append(0.) continue ### This part is only necessasry when we use options. We proceed to these verifications in order not to discard any collected trajectories. if datas[opt] != 0: if (indices.size < min_batch and datas[opt].n > min_batch): datas[opt] = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) t_advs[opt].append(0.) continue elif indices.size + datas[opt].n < min_batch: # pdb.set_trace() oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'], ob[indices])) cat_ac = np.concatenate((oldmap['ac'], ac[indices])) cat_atarg = np.concatenate( (oldmap['atarg'], atarg[indices])) cat_vtarg = np.concatenate( (oldmap['vtarg'], tdlamret[indices])) datas[opt] = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) t_advs[opt].append(0.) continue elif (indices.size + datas[opt].n > min_batch and datas[opt].n < min_batch) or (indices.size > min_batch and datas[opt].n < min_batch): oldmap = datas[opt].data_map cat_ob = np.concatenate((oldmap['ob'], ob[indices])) cat_ac = np.concatenate((oldmap['ac'], ac[indices])) cat_atarg = np.concatenate( (oldmap['atarg'], atarg[indices])) cat_vtarg = np.concatenate( (oldmap['vtarg'], tdlamret[indices])) datas[opt] = d = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent) if (indices.size > min_batch and datas[opt].n > min_batch): datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) elif datas[opt] == 0: datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent) ### optim_batchsize = optim_batchsize or ob.shape[0] optim_epochs = np.clip( np.int(10 * (indices.size / (timesteps_per_batch / num_options))), 10, 10) if num_options > 1 else optim_epochs print("optim epochs:", optim_epochs) logger.log("Optimizing...") # Here we do a bunch of optimization epochs over the data for _ in range(optim_epochs): losses = [ ] # list of tuples, each of which gives the loss for a minibatch for batch in d.iterate_once(optim_batchsize): tadv, nodc_adv = pi.get_term_adv(batch["ob"], [opt]) tadv = tadv if num_options > 1 else np.zeros_like(tadv) t_advs[opt].append(nodc_adv) *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv) termg = termloss(batch["ob"], [opt], tadv) adam.update(termg[0], 5e-7 * cur_lrmult) adam.update(grads, optim_stepsize * cur_lrmult) losses.append(newlosses) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if MPI.COMM_WORLD.Get_rank() == 0: logger.dump_tabular() ### Book keeping if saves: out = "{},{}" for _ in range(num_options): out += ",{},{},{},{}" out += "\n" info = [iters_so_far, np.mean(rewbuffer)] for i in range(num_options): info.append(opt_d[i]) for i in range(num_options): info.append(std[i]) for i in range(num_options): info.append(np.mean(np.array(seg['term_p']), axis=0)[i]) for i in range(num_options): info.append(np.mean(t_advs[i])) results.write(out.format(*info)) results.flush()
def apply_policy_to_sim_environment(timesteps, save_path, loop_num, num_hidden, sim_env_type): print("loop_num =", loop_num) env_id = 'CustomCartPole-v0' env = gym.make(env_id) from custom_env_wrap import custom_wrap #modify custom_wrap( env_id, #modify env.env, #modify model_switch=sim_env_type, #modify dirname="./result_apply/", #modify filename1="real_world_samples_input.csv", #modify filename2="real_world_samples_output.csv") #modify episode_count = 0 inputdata = [] outputdata = [] costdata = [] dt = cartpole_real_model.dt env.reset() env.env.state = np.array([0., 0., np.pi, 0.]) cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) logger.log("num_hidden =", num_hidden) policy = build_policy(env, "mlp", value_network='copy', num_hidden=num_hidden) with tf.variable_scope("pi"): pi = policy() U.initialize() pi.load("./result_apply/policy" + str(loop_num) + "/policy/") z, zdot, th, thdot = env.env.state prevz, prevzdot, prevth, prevthdot = z, zdot, th, thdot ob = env.env._get_obs() while True: # simulate onestep ac, vpred, _, _ = pi.step(ob, stochastic=True) ac = ac[0] ac = np.clip(ac, -cartpole_real_model.max_force, cartpole_real_model.max_force) print(z, zdot, th, thdot, ac) inputdata.append([z, zdot, th, thdot, ac]) ob, rew, new, _ = env.step(ac) z, zdot, th, thdot = env.env.state outputdata.append([(z - prevz) / dt, (zdot - prevzdot) / dt, (th - prevth) / dt, (thdot - prevthdot) / dt]) prevz, prevzdot, prevth, prevthdot = z, zdot, th, thdot costdata.append(rew) time.sleep(0.1) env.render() if len(outputdata) >= timesteps: print(np.array(costdata).sum()) break np_input = np.array(inputdata) np_output = np.array(outputdata) np_cost = np.array(costdata) np.savetxt(save_path + 'simulation_samples_input.csv', np_input, delimiter=',') np.savetxt(save_path + 'simulation_samples_output.csv', np_output, delimiter=',') np.savetxt(save_path + 'simulation_cost.csv', np_cost, delimiter=',') np.savetxt(save_path + 'simulation_total_cost.csv', np.array([np_cost.sum()]), delimiter=',', fmt='%.6f') env.close()
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # running averages with tf.variable_scope('o_stats') as vs: if reuse: vs.reuse_variables() self.o_stats = Normalizer(self.dimo, self.norm_eps, self.norm_clip, sess=self.sess) with tf.variable_scope('g_stats') as vs: if reuse: vs.reuse_variables() self.g_stats = Normalizer(self.dimg, self.norm_eps, self.norm_clip, sess=self.sess) # mini-batch sampling. batch = self.staging_tf.get() batch_tf = OrderedDict([(key, batch[i]) for i, key in enumerate(self.stage_shapes.keys())]) batch_tf['r'] = tf.reshape(batch_tf['r'], [-1, 1]) #choose only the demo buffer samples mask = np.concatenate((np.zeros(self.batch_size - self.demo_batch_size), np.ones(self.demo_batch_size)), axis = 0) # networks with tf.variable_scope('main') as vs: if reuse: vs.reuse_variables() self.main = self.create_actor_critic(batch_tf, net_type='main', **self.__dict__) vs.reuse_variables() with tf.variable_scope('target') as vs: if reuse: vs.reuse_variables() target_batch_tf = batch_tf.copy() target_batch_tf['o'] = batch_tf['o_2'] target_batch_tf['g'] = batch_tf['g_2'] self.target = self.create_actor_critic( target_batch_tf, net_type='target', **self.__dict__) vs.reuse_variables() assert len(self._vars("main")) == len(self._vars("target")) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = tf.clip_by_value(batch_tf['r'] + self.gamma * target_Q_pi_tf, *clip_range) self.Q_loss_tf = tf.reduce_mean(tf.square(tf.stop_gradient(target_tf) - self.main.Q_tf)) if self.bc_loss ==1 and self.q_filter == 1 : # train with demonstrations and use bc_loss and q_filter both maskMain = tf.reshape(tf.boolean_mask(self.main.Q_tf > self.main.Q_pi_tf, mask), [-1]) #where is the demonstrator action better than actor action according to the critic? choose those samples only #define the cloning loss on the actor's actions only on the samples which adhere to the above masks self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask(tf.boolean_mask((self.main.pi_tf), mask), maskMain, axis=0) - tf.boolean_mask(tf.boolean_mask((batch_tf['u']), mask), maskMain, axis=0))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) #primary loss scaled by it's respective weight prm_loss_weight self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) #L2 loss on action values scaled by the same weight prm_loss_weight self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf #adding the cloning loss to the actor loss as an auxilliary loss scaled by its weight aux_loss_weight elif self.bc_loss == 1 and self.q_filter == 0: # train with demonstrations without q_filter self.cloning_loss_tf = tf.reduce_sum(tf.square(tf.boolean_mask((self.main.pi_tf), mask) - tf.boolean_mask((batch_tf['u']), mask))) self.pi_loss_tf = -self.prm_loss_weight * tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.prm_loss_weight * self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) self.pi_loss_tf += self.aux_loss_weight * self.cloning_loss_tf else: #If not training with demonstrations self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) Q_grads_tf = tf.gradients(self.Q_loss_tf, self._vars('main/Q')) pi_grads_tf = tf.gradients(self.pi_loss_tf, self._vars('main/pi')) assert len(self._vars('main/Q')) == len(Q_grads_tf) assert len(self._vars('main/pi')) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self._vars('main/Q')) self.pi_grads_vars_tf = zip(pi_grads_tf, self._vars('main/pi')) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self._vars('main/Q')) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self._vars('main/pi')) # optimizers self.Q_adam = MpiAdam(self._vars('main/Q'), scale_grad_by_procs=False) self.pi_adam = MpiAdam(self._vars('main/pi'), scale_grad_by_procs=False) # polyak averaging self.main_vars = self._vars('main/Q') + self._vars('main/pi') self.target_vars = self._vars('target/Q') + self._vars('target/pi') self.stats_vars = self._global_vars('o_stats') + self._global_vars('g_stats') self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables tf.variables_initializer(self._global_vars('')).run() self._sync_optimizers() self._init_target_net()
def learn(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps, animate=False, callback=None, optimizer="adam", desired_kl=0.002): obfilter = ZFilter(env.observation_space.shape) max_pathlength = env.spec.timestep_limit stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)), name='stepsize') inputs, loss, loss_sampled = policy.update_info optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\ epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1, weight_decay_dict=policy.wd_dict, max_grad_norm=None) pi_var_list = [] for var in tf.trainable_variables(): if "pi" in var.name: pi_var_list.append(var) update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=pi_var_list) do_update = U.function(inputs, update_op) U.initialize() # start queue runners enqueue_threads = [] coord = tf.train.Coordinator() for qr in [q_runner, vf.q_runner]: assert (qr != None) enqueue_threads.extend(qr.create_threads(U.get_session(), coord=coord, start=True)) i = 0 timesteps_so_far = 0 while True: if timesteps_so_far > num_timesteps: break logger.log("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter) paths.append(path) n = pathlength(path) timesteps_this_batch += n timesteps_so_far += n if timesteps_this_batch > timesteps_per_batch: break # Estimate advantage function vtargs = [] advs = [] for path in paths: rew_t = path["reward"] return_t = common.discount(rew_t, gamma) vtargs.append(return_t) vpred_t = vf.predict(path) vpred_t = np.append(vpred_t, 0.0 if path["terminated"] else vpred_t[-1]) delta_t = rew_t + gamma*vpred_t[1:] - vpred_t[:-1] adv_t = common.discount(delta_t, gamma * lam) advs.append(adv_t) # Update value function vf.fit(paths, vtargs) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) action_na = np.concatenate([path["action"] for path in paths]) oldac_dist = np.concatenate([path["action_dist"] for path in paths]) logp_n = np.concatenate([path["logp"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) # Policy update do_update(ob_no, action_na, standardized_adv_n) # Adjust stepsize kl = policy.compute_kl(ob_no, oldac_dist) if kl > desired_kl * 2: logger.log("kl too high") U.eval(tf.assign(stepsize, stepsize / 1.5)) elif kl < desired_kl / 2: logger.log("kl too low") U.eval(tf.assign(stepsize, stepsize * 1.5)) else: logger.log("kl just right!") logger.record_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logger.record_tabular("EpRewSEM", np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])) logger.record_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logger.record_tabular("KL", kl) if callback: callback() logger.dump_tabular() i += 1
def learn(network, env, seed=None, nsteps=20, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01, max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99, log_interval=50, buffer_size=50000, replay_ratio=8, replay_start=10000, c=10.0, trust_region=True, alpha=0.99, delta=1, replay_k=4, load_path=None, store_data=False, feat_dim=512, queue_size=1000, env_eval=None, eval_interval=300, use_eval_collect=True, use_expl_collect=True, aux_task="RF", dyna_source_list=["acer_eval", "acer_expl"], dist_type="l1", use_random_policy_expl=True, goal_shape=None, normalize_novelty=False, save_model=False, buffer2=True, **network_kwargs): ''' Main entrypoint for ACER (Actor-Critic with Experience Replay) algorithm (https://arxiv.org/pdf/1611.01224.pdf) Train an agent with given network architecture on a given environment using ACER. Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) (default: 20) nstack: int, size of the frame stack, i.e. number of the frames passed to the step model. Frames are stacked along channel dimension (last image dimension) (default: 4) total_timesteps: int, number of timesteps (i.e. number of actions taken in the environment) (default: 80M) q_coef: float, value function loss coefficient in the optimization objective (analog of vf_coef for other actor-critic methods) ent_coef: float, policy entropy coefficient in the optimization objective (default: 0.01) max_grad_norm: float, gradient norm clipping coefficient. If set to None, no clipping. (default: 10), lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output rprop_epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) rprop_alpha: float, RMSProp decay parameter (default: 0.99) gamma: float, reward discounting factor (default: 0.99) log_interval: int, number of updates between logging events (default: 100) buffer_size: int, size of the replay buffer (default: 50k) replay_ratio: int, now many (on average) batches of data to sample from the replay buffer take after batch from the environment (default: 4) replay_start: int, the sampling from the replay buffer does not start until replay buffer has at least that many samples (default: 10k) c: float, importance weight clipping factor (default: 10) trust_region bool, whether or not algorithms estimates the gradient KL divergence between the old and updated policy and uses it to determine step size (default: True) delta: float, max KL divergence between the old policy and updated policy (default: 1) alpha: float, momentum factor in the Polyak (exponential moving average) averaging of the model parameters (default: 0.99) load_path: str, path to load the model from (default: None) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. ''' if sys.platform == "darwin": log_interval = 5 logger.info("Running Acer with following kwargs") logger.info(locals()) logger.info("\n") set_global_seeds(seed) if not isinstance(env, VecFrameStack): env = VecFrameStack(env, 1) if env_eval is None: raise ValueError("env_eval is required!") policy = build_policy(env, network, estimate_q=True, **network_kwargs) nenvs = env.num_envs nenvs_eval = env_eval.num_envs ob_space = env.observation_space ac_space = env.action_space nstack = env.nstack sess = get_session() dynamics = Dynamics(sess=sess, env=env, auxiliary_task=aux_task, queue_size=queue_size, feat_dim=feat_dim, normalize_novelty=normalize_novelty) dummy_dynamics = DummyDynamics(goal_shape) model_exploration = Model( sess=sess, policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, c=c, trust_region=trust_region, alpha=alpha, delta=delta, dynamics=dynamics, scope="acer_expl", goal_shape=goal_shape, ) model_evaluation = Model(sess=sess, policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma, max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule, c=c, trust_region=trust_region, alpha=alpha, delta=delta, dynamics=dummy_dynamics, scope="acer_eval", goal_shape=goal_shape) def reward_fn_v1(current_state, desired_goal): eps = 1e-6 return np.exp(-np.sum(np.square(current_state - desired_goal), -1) / (eps + np.sum(np.square(desired_goal), -1))) def reward_fn_v2(current_pos_infos, goal_pos_infos, sparse=True): assert current_pos_infos.shape == goal_pos_infos.shape coeff = 0.03 threshold = 20 dist = vf_dist(current_pos_infos, goal_pos_infos) if sparse: rewards = (dist < threshold).astype(float) else: rewards = np.exp(-coeff * dist) return rewards assert dist_type in ["l1", "l2"] if dist_type == "l2": reward_fn = reward_fn_v1 else: reward_fn = reward_fn_v2 # we still need two runner to avoid one reset others' envs. runner_expl = Runner(env=env, model=model_exploration, nsteps=nsteps, store_data=store_data, reward_fn=reward_fn, sample_goal=True, dist_type=dist_type, alt_model=model_evaluation, use_random_policy_expl=use_random_policy_expl) runner_eval = Runner(env=env_eval, model=model_evaluation, nsteps=nsteps, store_data=store_data, reward_fn=reward_fn, sample_goal=False, dist_type=dist_type) if replay_ratio > 0: sample_goal_fn = make_sample_her_transitions("future", replay_k) assert env.num_envs == env_eval.num_envs if buffer2: buffer = ReplayBuffer(env=env, sample_goal_fn=sample_goal_fn, nsteps=nsteps, size=buffer_size, keys=get_store_keys(), reward_fn=reward_fn) else: buffer = Buffer(env=env, nsteps=nsteps, size=buffer_size, reward_fn=reward_fn, sample_goal_fn=sample_goal_fn, goal_shape=model_exploration.goal_shape) else: buffer = None nbatch_expl = nenvs * nsteps nbatch_eval = nenvs_eval * nsteps acer = Acer(runner_expl, runner_eval, model_exploration, model_evaluation, buffer, log_interval, dyna_source_list, save_model) acer.tstart = time.time() # === init to make sure we can get goal === acer.initialize() replay_start = replay_start * env.num_envs / (env.num_envs + env_eval.num_envs) onpolicy_cnt = 0 while acer.steps < total_timesteps: if use_eval_collect: acer.call(on_policy=True, model_name="eval", update_list=["expl", "eval"]) acer.steps += nbatch_eval onpolicy_cnt += 1 if use_expl_collect: acer.call(on_policy=True, model_name="expl", update_list=["eval", "expl"]) acer.steps += nbatch_expl onpolicy_cnt += 1 if replay_ratio > 0: n = replay_ratio for i in range(n): if buffer.has_atleast(replay_start): if i < n // 2: if i == 0: acer.call(on_policy=False, update_list=["expl", "eval"], use_cache=False) else: acer.call(on_policy=False, update_list=["expl", "eval"], use_cache=True) else: pass # acer.call(on_policy=False, update_list=["expl"]) if not use_eval_collect and onpolicy_cnt % eval_interval == 0: acer.evaluate(nb_eval=1) acer.save( os.path.join(logger.get_dir(), "models", "{}.pkl".format(acer.steps))) return model_evaluation
def build_adv(make_obs_tf, q_func, num_actions, epsilon, attack): with tf.variable_scope('deepq', reuse=tf.AUTO_REUSE): obs_tf_in = make_obs_tf("observation") stochastic_ph_adv = tf.placeholder(tf.bool, (), name="stochastic_adv") update_eps_ph_adv = tf.placeholder(tf.float32, (), name="update_eps_adv") eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0)) update_eps_expr_adv = eps.assign( tf.cond(update_eps_ph_adv >= 0, lambda: update_eps_ph_adv, lambda: eps)) def wrapper(x): return q_func(x, num_actions, scope="q_func", concat_softmax=False) if attack == 'fgsm': adversary = FastGradientMethod(CallableModelWrapper( wrapper, 'logits'), sess=U.get_session()) adv_observations = adversary.generate(obs_tf_in.get(), eps=epsilon, clip_min=0.0, clip_max=255.0, ord=np.inf) elif attack == 'bim': adversary = BasicIterativeMethod(CallableModelWrapper( wrapper, 'logits'), sess=U.get_session()) adv_observations = adversary.generate(obs_tf_in.get(), eps=epsilon, eps_iter=epsilon / 10, nb_iter=10, clip_min=0.0, clip_max=255.0, ord=np.inf) elif attack == 'deepfool': adversary = DeepFool(CallableModelWrapper(wrapper, 'logits'), sess=U.get_session()) adv_observations = adversary.generate(obs_tf_in.get(), clip_min=0.0, clip_max=255.0, nb_candidate=num_actions) elif attack == 'momentum': adversary = MomentumIterativeMethod(CallableModelWrapper( wrapper, 'logits'), sess=U.get_session()) adv_observations = adversary.generate(obs_tf_in.get(), eps=epsilon, eps_iter=epsilon / 10, nb_iter=10, clip_min=0.0, clip_max=255.0) elif attack == 'jsma': adversary = SaliencyMapMethod(CallableModelWrapper( wrapper, 'logits'), sess=U.get_session()) adv_observations = adversary.generate(obs_tf_in.get(), clip_min=0.0, clip_max=255.0) craft_adv_obs = U.function( inputs=[obs_tf_in, stochastic_ph_adv, update_eps_ph_adv], outputs=adv_observations, givens={ update_eps_ph_adv: -1.0, stochastic_ph_adv: True }, updates=[update_eps_expr_adv]) return craft_adv_obs
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear', is_async=True): self.sess = sess = get_session() nbatch = nenvs * nsteps with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE): self.model = step_model = policy(nenvs, 1, sess=sess) self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) self.logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV*neglogpac) entropy = tf.reduce_mean(train_model.pd.entropy()) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac) sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params=params = find_trainable_variables("acktr_model") self.grads_check = grads = tf.gradients(train_loss,params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ momentum=0.9, kfac_update=1, epsilon=0.01,\ stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm) # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads,params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr, VF_LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map ) return policy_loss, value_loss, policy_entropy self.train = train self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def learn( *, network, env, eval_env, make_eval_env, env_id, seed, beta, total_timesteps, sil_update, sil_loss, timesteps_per_batch, # what to train on #num_samples=(1500,), num_samples=(1, ), #horizon=(5,), horizon=(2, ), #num_elites=(10,), num_elites=(1, ), max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation ent_coef=0.0, lr=3e-4, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=5, sil_value=0.01, sil_alpha=0.6, sil_beta=0.1, max_episodes=0, max_iters=0, # time constraint callback=None, save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None, mpi_rank_weight=1, comm=None, vf_coef=0.5, max_grad_norm=0.5, log_interval=1, nminibatches=4, noptepochs=4, cliprange=0.2, TRPO=False, # MBL # For train mbl mbl_train_freq=5, # For eval num_eval_episodes=5, eval_freq=5, vis_eval=False, eval_targs=('mbmf', ), #eval_targs=('mf',), quant=2, # For mbl.step mbl_lamb=(1.0, ), mbl_gamma=0.99, #mbl_sh=1, # Number of step for stochastic sampling mbl_sh=10000, #vf_lookahead=-1, #use_max_vf=False, reset_per_step=(0, ), # For get_model num_fc=2, num_fwd_hidden=500, use_layer_norm=False, # For MBL num_warm_start=int(1e4), init_epochs=10, update_epochs=5, batch_size=512, update_with_validation=False, use_mean_elites=1, use_ent_adjust=0, adj_std_scale=0.5, # For data loading validation_set_path=None, # For data collect collect_val_data=False, # For traj collect traj_collect='mf', # For profile measure_time=True, eval_val_err=False, measure_rew=True, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if not isinstance(num_samples, tuple): num_samples = (num_samples, ) if not isinstance(horizon, tuple): horizon = (horizon, ) if not isinstance(num_elites, tuple): num_elites = (num_elites, ) if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, ) if not isinstance(reset_per_step, tuple): reset_per_step = (reset_per_step, ) if validation_set_path is None: if collect_val_data: validation_set_path = os.path.join(logger.get_dir(), 'val.pkl') else: validation_set_path = os.path.join('dataset', '{}-val.pkl'.format(env_id)) if eval_val_err: eval_val_err_path = os.path.join('dataset', '{}-combine-val.pkl'.format(env_id)) logger.log(locals()) logger.log('MBL_SH', mbl_sh) logger.log('Traj_collect', traj_collect) set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) policy = build_policy(env, network, value_network='copy', copos=True, **network_kwargs) nenvs = env.num_envs np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * timesteps_per_batch nbatch_train = nbatch // nminibatches is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0) if model_fn is None: model_fn = Model discrete_ac_space = isinstance(ac_space, gym.spaces.Discrete) ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) make_model = lambda: Model( policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=timesteps_per_batch, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, sil_update=sil_update, sil_value=sil_value, sil_alpha=sil_alpha, sil_beta=sil_beta, sil_loss=sil_loss, # fn_reward=env.process_reward, fn_reward=None, # fn_obs=env.process_obs, fn_obs=None, ppo=False, prev_pi='pi', silm=pi) model = make_model() if load_path is not None: model.load(load_path) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) make_old_model = lambda: Model( policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=timesteps_per_batch, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, sil_update=sil_update, sil_value=sil_value, sil_alpha=sil_alpha, sil_beta=sil_beta, sil_loss=sil_loss, # fn_reward=env.process_reward, fn_reward=None, # fn_obs=env.process_obs, fn_obs=None, ppo=False, prev_pi='oldpi', silm=oldpi) old_model = make_old_model() # MBL # --------------------------------------- #viz = Visdom(env=env_id) win = None eval_targs = list(eval_targs) logger.log(eval_targs) make_model_f = get_make_mlp_model(num_fc=num_fc, num_fwd_hidden=num_fwd_hidden, layer_norm=use_layer_norm) mbl = MBL(env=eval_env, env_id=env_id, make_model=make_model_f, num_warm_start=num_warm_start, init_epochs=init_epochs, update_epochs=update_epochs, batch_size=batch_size, **network_kwargs) val_dataset = {'ob': None, 'ac': None, 'ob_next': None} if update_with_validation: logger.log('Update with validation') val_dataset = load_val_data(validation_set_path) if eval_val_err: logger.log('Log val error') eval_val_dataset = load_val_data(eval_val_err_path) if collect_val_data: logger.log('Collect validation data') val_dataset_collect = [] def _mf_pi(ob, t=None): stochastic = True ac, vpred, _, _ = pi.step(ob, stochastic=stochastic) return ac, vpred def _mf_det_pi(ob, t=None): #ac, vpred, _, _ = pi.step(ob, stochastic=False) ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob) return ac, vpred def _mf_ent_pi(ob, t=None): mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob) ac = np.random.normal(mean, std * adj_std_scale, size=mean.shape) return ac, vpred ################### use_ent_adjust======> adj_std_scale????????pi action sample def _mbmf_inner_pi(ob, t=0): if use_ent_adjust: return _mf_ent_pi(ob) else: #return _mf_pi(ob) if t < mbl_sh: return _mf_pi(ob) else: return _mf_det_pi(ob) # --------------------------------------- # Run multiple configuration once all_eval_descs = [] def make_mbmf_pi(n, h, e, l): def _mbmf_pi(ob): ac, rew = mbl.step(ob=ob, pi=_mbmf_inner_pi, horizon=h, num_samples=n, num_elites=e, gamma=mbl_gamma, lamb=l, use_mean_elites=use_mean_elites) return ac[None], rew return Policy(step=_mbmf_pi, reset=None) for n in num_samples: for h in horizon: for l in mbl_lamb: for e in num_elites: if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew', 'MBL_COPOS_SIL', make_mbmf_pi(n, h, e, l))) #if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), 'MBL_TRPO-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), make_mbmf_pi(n, h, e, l))) if 'mf' in eval_targs: all_eval_descs.append( ('MeanRew', 'COPOS_SIL', Policy(step=_mf_pi, reset=None))) logger.log('List of evaluation targets') for it in all_eval_descs: logger.log(it[0]) pool = Pool(mp.cpu_count()) warm_start_done = False # ---------------------------------------- atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Initialize eta, omega optimizer if discrete_ac_space: init_eta = 1 init_omega = 0.5 eta_omega_optimizer = EtaOmegaOptimizerDiscrete( beta, max_kl, init_eta, init_omega) else: init_eta = 0.5 init_omega = 2.0 #????eta_omega_optimizer details????? eta_omega_optimizer = EtaOmegaOptimizer(beta, max_kl, init_eta, init_omega) # Prepare for rollouts # ---------------------------------------- if traj_collect == 'mf': seg_gen = traj_segment_generator(env, timesteps_per_batch, model, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() if traj_collect == 'mf-random' or traj_collect == 'mf-mb': seg_mbl = seg_gen_mbl.__next__() else: seg_mbl = seg add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] # Val data collection if collect_val_data: for ob_, ac_, ob_next_ in zip(ob[:-1, 0, ...], ac[:-1, ...], ob[1:, 0, ...]): val_dataset_collect.append( (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_))) # ----------------------------- # MBL update else: ob_mbl, ac_mbl = seg_mbl["ob"], seg_mbl["ac"] mbl.add_data_batch(ob_mbl[:-1, 0, ...], ac_mbl[:-1, ...], ob_mbl[1:, 0, ...]) mbl.update_forward_dynamic(require_update=iters_so_far % mbl_train_freq == 0, ob_val=val_dataset['ob'], ac_val=val_dataset['ac'], ob_next_val=val_dataset['ob_next']) # ----------------------------- if traj_collect == 'mf': #if traj_collect == 'mf' or traj_collect == 'mf-random' or traj_collect == 'mf-mb': vpredbefore = seg[ "vpred"] # predicted value function before udpate model = seg["model"] atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "rms"): pi.rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() if TRPO: shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log( "Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log( "violated KL constraint. shrinking step.") elif improve < 0: logger.log( "surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) else: copos_update_dir = stepdir # Split direction into log-linear 'w_theta' and non-linear 'w_beta' parts w_theta, w_beta = pi.split_w(copos_update_dir) tmp_ob = np.zeros( (1, ) + env.observation_space.shape ) # We assume that entropy does not depend on the NN # Optimize eta and omega if discrete_ac_space: entropy = lossbefore[4] #entropy = - 1/timesteps_per_batch * np.sum(np.sum(pi.get_action_prob(ob) * pi.get_log_action_prob(ob), axis=1)) eta, omega = eta_omega_optimizer.optimize( pi.compute_F_w(ob, copos_update_dir), pi.get_log_action_prob(ob), timesteps_per_batch, entropy) else: Waa, Wsa = pi.w2W(w_theta) wa = pi.get_wa(ob, w_beta) varphis = pi.get_varphis(ob) #old_ent = old_entropy.eval({oldpi.ob: tmp_ob})[0] old_ent = lossbefore[4] eta, omega = eta_omega_optimizer.optimize( w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent) logger.log("Initial eta: " + str(eta) + " and omega: " + str(omega)) current_theta_beta = get_flat() prev_theta, prev_beta = pi.all_to_theta_beta( current_theta_beta) if discrete_ac_space: # Do a line search for both theta and beta parameters by adjusting only eta eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, max_kl, args, discrete_ac_space) logger.log("Updated eta, eta: " + str(eta)) set_from_flat( pi.theta_beta_to_all(prev_theta, prev_beta)) # Find proper omega for new eta. Use old policy parameters first. eta, omega = eta_omega_optimizer.optimize( pi.compute_F_w(ob, copos_update_dir), pi.get_log_action_prob(ob), timesteps_per_batch, entropy, eta) logger.log("Updated omega, eta: " + str(eta) + " and omega: " + str(omega)) # do line search for ratio for non-linear "beta" parameter values #ratio = beta_ratio_line_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, # max_kl, beta, args) # set ratio to 1 if we do not use beta ratio line search ratio = 1 #print("ratio from line search: " + str(ratio)) cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega) cur_beta = prev_beta + ratio * w_beta.reshape( -1, ) / eta else: for i in range(2): # Do a line search for both theta and beta parameters by adjusting only eta eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, max_kl, args) logger.log("Updated eta, eta: " + str(eta) + " and omega: " + str(omega)) # Find proper omega for new eta. Use old policy parameters first. set_from_flat( pi.theta_beta_to_all(prev_theta, prev_beta)) eta, omega = \ eta_omega_optimizer.optimize(w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent, eta) logger.log("Updated omega, eta: " + str(eta) + " and omega: " + str(omega)) # Use final policy logger.log("Final eta: " + str(eta) + " and omega: " + str(omega)) cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega) cur_beta = prev_beta + w_beta.reshape(-1, ) / eta set_from_flat(pi.theta_beta_to_all(cur_theta, cur_beta)) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) ##copos specific over if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) #cg over for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) #policy update over with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) with timed("SIL"): lrnow = lr(1.0 - timesteps_so_far / total_timesteps) l_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train( lrnow) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if sil_update > 0: logger.record_tabular("SilSamples", sil_samples) if rank == 0: # MBL evaluation if not collect_val_data: #set_global_seeds(seed) default_sess = tf.get_default_session() def multithread_eval_policy(env_, pi_, num_episodes_, vis_eval_, seed): with default_sess.as_default(): if hasattr(env, 'ob_rms') and hasattr(env_, 'ob_rms'): env_.ob_rms = env.ob_rms res = eval_policy(env_, pi_, num_episodes_, vis_eval_, seed, measure_time, measure_rew) try: env_.close() except: pass return res if mbl.is_warm_start_done() and iters_so_far % eval_freq == 0: warm_start_done = mbl.is_warm_start_done() if num_eval_episodes > 0: targs_names = {} with timed('eval'): num_descs = len(all_eval_descs) list_field_names = [e[0] for e in all_eval_descs] list_legend_names = [e[1] for e in all_eval_descs] list_pis = [e[2] for e in all_eval_descs] list_eval_envs = [ make_eval_env() for _ in range(num_descs) ] list_seed = [seed for _ in range(num_descs)] list_num_eval_episodes = [ num_eval_episodes for _ in range(num_descs) ] print(list_field_names) print(list_legend_names) list_vis_eval = [ vis_eval for _ in range(num_descs) ] for i in range(num_descs): field_name, legend_name = list_field_names[ i], list_legend_names[i], res = multithread_eval_policy( list_eval_envs[i], list_pis[i], list_num_eval_episodes[i], list_vis_eval[i], seed) #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed)) #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results): perf, elapsed_time, eval_rew = res logger.record_tabular(field_name, perf) if measure_time: logger.record_tabular( 'Time-%s' % (field_name), elapsed_time) if measure_rew: logger.record_tabular( 'SimRew-%s' % (field_name), eval_rew) targs_names[field_name] = legend_name if eval_val_err: fwd_dynamics_err = mbl.eval_forward_dynamic( obs=eval_val_dataset['ob'], acs=eval_val_dataset['ac'], obs_next=eval_val_dataset['ob_next']) logger.record_tabular('FwdValError', fwd_dynamics_err) logger.dump_tabular() #print(logger.get_dir()) #print(targs_names) # if num_eval_episodes > 0: # win = plot(viz, win, logger.get_dir(), targs_names=targs_names, quant=quant, opt='best') # ----------- #logger.dump_tabular() yield pi if collect_val_data: with open(validation_set_path, 'wb') as f: pickle.dump(val_dataset_collect, f) logger.log('Save {} validation data'.format(len(val_dataset_collect)))
def learn(env, network, seed=None, lr=5e-4, total_timesteps=100000, buffer_size=50000, exploration_fraction=0.1, exploration_final_eps=0.02, train_freq=1, batch_size=32, print_freq=100, checkpoint_freq=10000, checkpoint_path=None, learning_starts=1000, gamma=1.0, target_network_update_freq=500, prioritized_replay=False, prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6, param_noise=False, callback=None, load_path=None, **network_kwargs ): """Train a deepq model. Parameters ------- env: gym.Env environment to train on network: string or a function neural network to use as a q function approximator. If string, has to be one of the names of registered models in baselines.common.models (mlp, cnn, conv_only). If a function, should take an observation tensor and return a latent variable tensor, which will be mapped to the Q function heads (see build_q_func in baselines.deepq.models for details on that) seed: int or None prng seed. The runs with the same seed "should" give the same results. If None, no seeding is used. lr: float learning rate for adam optimizer total_timesteps: int number of env steps to optimizer for buffer_size: int size of the replay buffer exploration_fraction: float fraction of entire training period over which the exploration rate is annealed exploration_final_eps: float final value of random action probability train_freq: int update the model every `train_freq` steps. set to None to disable printing batch_size: int size of a batched sampled from replay buffer for training print_freq: int how often to print out training progress set to None to disable printing checkpoint_freq: int how often to save the model. This is so that the best version is restored at the end of the training. If you do not wish to restore the best version at the end of the training set this variable to None. learning_starts: int how many steps of the model to collect transitions for before learning starts gamma: float discount factor target_network_update_freq: int update the target network every `target_network_update_freq` steps. prioritized_replay: True if True prioritized replay buffer will be used. prioritized_replay_alpha: float alpha parameter for prioritized replay buffer prioritized_replay_beta0: float initial value of beta for prioritized replay buffer prioritized_replay_beta_iters: int number of iterations over which beta will be annealed from initial value to 1.0. If set to None equals to total_timesteps. prioritized_replay_eps: float epsilon to add to the TD errors when updating priorities. param_noise: bool whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905) callback: (locals, globals) -> None function called at every steps with state of the algorithm. If callback returns true training stops. load_path: str path to load the model from. (default: None) **network_kwargs additional keyword arguments to pass to the network builder. Returns ------- act: ActWrapper Wrapper over act function. Adds ability to save it and load it. See header of baselines/deepq/categorical.py for details on the act function. """ # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs(td_errors) + prioritized_replay_eps replay_buffer.update_priorities(batch_idxes, new_priorities) if t > learning_starts and t % target_network_update_freq == 0: # Update target network periodically. update_target() mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) num_episodes = len(episode_rewards) if done and print_freq is not None and len(episode_rewards) % print_freq == 0: logger.record_tabular("steps", t) logger.record_tabular("episodes", num_episodes) logger.record_tabular("mean 100 episode reward", mean_100ep_reward) logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) logger.dump_tabular() if (checkpoint_freq is not None and t > learning_starts and num_episodes > 100 and t % checkpoint_freq == 0): if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward: if print_freq is not None: logger.log("Saving model due to mean reward increase: {} -> {}".format( saved_mean_reward, mean_100ep_reward)) save_variables(model_file) model_saved = True saved_mean_reward = mean_100ep_reward if model_saved: if print_freq is not None: logger.log("Restored model with mean reward: {}".format(saved_mean_reward)) load_variables(model_file) return act
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, adaptive_kl): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training train_model = policy(None, nsteps, sess) # CREATE THE PLACEHOLDERS A = train_model.pdtype.sample_placeholder([None]) MEANNOW = train_model.pdtype.sample_placeholder([None]) LOGSTDNOW = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) # Keep track of old actor OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) NEGLOGPACNOW = tf.placeholder(tf.float32, [None]) RHO_NOW = tf.placeholder(tf.float32, [None]) # Keep track of old critic OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) # Cliprange CLIPRANGE = tf.placeholder(tf.float32, []) KLCONST = tf.placeholder(tf.float32, []) KL_REST = tf.placeholder(tf.float32, [None]) neglogpac = train_model.pd.neglogp(A) mean = train_model.pd.mean logstd = train_model.pd.logstd # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(- 0.5 * tf.square((A - mean) / tf.exp(logstd)) - logstd + 0.5 * tf.square( (A - MEANNOW) / tf.exp(LOGSTDNOW)) + LOGSTDNOW) sgn = tf.ones_like(ratio) * tf.expand_dims(tf.sign(ADV), 1) ratio_clip = tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Defining Loss = - J is equivalent to max J r = tf.reduce_prod(sgn * tf.minimum(ratio * sgn, ratio_clip * sgn),axis=-1) pg_losses = - r * ADV / tf.stop_gradient(tf.reduce_mean(r)) # * tf.minimum(1.0,RHO_NOW) # Final PG loss # pg_loss = tf.reduce_mean(tf.stop_gradient(tf.maximum(pg_losses, pg_losses2))*(-neglogpac)) + .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC) * KL_REST) approxklold = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) approxklnow = .5 * tf.reduce_mean(tf.square(neglogpac - NEGLOGPACNOW) * tf.minimum(1.0,RHO_NOW)) kloldnew = tf.reduce_mean(tf.reduce_sum( logstd - LOGSTDNOW + 0.5 * (tf.square(tf.exp(LOGSTDNOW)) + tf.square(mean - MEANNOW)) / tf.square( tf.exp(logstd)) - 0.5, axis=1)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) pg_loss = tf.reduce_mean(pg_losses) # * tf.minimum(1.0,RHO_NOW)) # Total loss# * tf.minimum(1.0,RHO_NOW)) if adaptive_kl: pg_loss = pg_loss + KLCONST*approxkl # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') print(params) # 2. Build our trainer if MPI is not None: trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) _grad_norm = tf.sqrt(tf.reduce_sum([tf.norm(grad) ** 2 for grad in grads])) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, klconst, rgae, trunc_rho, obs, returns, advs, masks, actions, values, neglogpacs, mean_now, logstd_now, kl_rest, rho_now, neglogpnow, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # Returns = R + yV(s') # Normalize the advantages if rgae: r = np.minimum(trunc_rho,rho_now) radvs = r*advs advs = (advs - radvs.mean()/r.mean()) / (radvs.std() + 1e-8) else: advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr, CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values, MEANNOW:mean_now, LOGSTDNOW:logstd_now, KLCONST:klconst, KL_REST:kl_rest, RHO_NOW:rho_now, NEGLOGPACNOW:neglogpnow} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, kloldnew, approxklold, approxklnow, _grad_norm, _train], td_map )[:-1] self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'kloldnew', 'approxklold', 'approxklnow', 'gradnorm'] self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.meanlogstd = act_model.meanlogstd self.value = act_model.value self.values = train_model.value self.meanlogstds = train_model.meanlogstd self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta): sess = get_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) # actions D = tf.placeholder(tf.float32, [nbatch]) # dones R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape) train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape) with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE): step_model = policy(nbatch=nenvs, nsteps=1, observ_placeholder=step_ob_placeholder, sess=sess) train_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess) params = find_trainable_variables("acer_model") print("Params {}".format(len(params))) for var in params: print(var) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) print(v.name) return v with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True): polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i # action probability distributions according to train_model, polyak_model and step_model # poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax train_model_p = tf.nn.softmax(train_model.pi) polyak_model_p = tf.nn.softmax(polyak_model.pi) step_model_p = tf.nn.softmax(step_model.pi) v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)] # strip off last step f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, A) q_i = get_by_index(q, A) # Compute ratios for importance truncation rho = f / (MU + eps) rho_i = get_by_index(rho, A) # Calculate Q_retrace targets qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling adv = qret - v logf = tf.log(f_i + eps) gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2) gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f loss_bc= -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]]*2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g/(nenvs*nsteps) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)] avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads] names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads'] if trust_region: run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj] names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'] def train(obs, actions, rewards, dones, mus, states, masks, steps): cur_lr = lr.value_steps(steps) td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks td_map[polyak_model.S] = states td_map[polyak_model.M] = masks return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train def _step(observation, **kwargs): return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs) self.train = train self.save = functools.partial(save_variables, sess=sess, variables=params) self.train_model = train_model self.step_model = step_model self._step = _step self.step = self.step_model.step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def learn( *, network, env, total_timesteps, timesteps_per_batch=1024, # what to train on max_kl=0.002, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation seed=None, ent_coef=0.00, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, num_reward=1, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) set_global_seeds(seed) # 创建policy policy = build_policy(env, network, value_network='copy', num_reward=num_reward, **network_kwargs) process_dir = logger.get_dir() save_dir = process_dir.split( 'Data')[-2] + 'log/mu/seed' + process_dir[-1] + '/' os.makedirs(save_dir, exist_ok=True) coe_save = [] impro_save = [] grad_save = [] adj_save = [] coe = np.ones((num_reward)) / num_reward np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space ################################################################# # ob ac ret atarg 都是 placeholder # ret atarg 此处应该是向量形式 ob = observation_placeholder(ob_space) # 创建pi和oldpi with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) # 每个reward都可以算一个atarg atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None, num_reward]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) #此处的KL div和entropy与reward无关 ################################## kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) # entbonus 是entropy loss entbonus = ent_coef * meanent ################################# ########################################################### # vferr 用来更新 v 网络 vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) # optimgain 用来更新 policy 网络, 应该每个reward有一个 optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] ########################################################### dist = meankl # 定义要优化的变量和 V 网络 adam 优化器 all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) # 把变量展开成一个向量的类 get_flat = U.GetFlat(var_list) # 这个类可以把一个向量分片赋值给var_list里的变量 set_from_flat = U.SetFromFlat(var_list) # kl散度的梯度 klgrads = tf.gradients(dist, var_list) #################################################################### # 拉直的向量 flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") # 把拉直的向量重新分成很多向量 shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz #################################################################### #################################################################### # 把kl散度梯度与变量乘积相加 gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 # 把gvp的梯度展成向量 fvp = U.flatgrad(gvp, var_list) #################################################################### # 用学习后的策略更新old策略 assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) # 计算loss compute_losses = U.function([ob, ac, atarg], losses) # 计算loss和梯度 compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) # 计算fvp compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) # 计算值网络的梯度 compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) if MPI is not None: out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers else: out = np.copy(x) return out # 初始化variable U.initialize() if load_path is not None: pi.load(load_path) # 得到初始化的参数向量 th_init = get_flat() if MPI is not None: MPI.COMM_WORLD.Bcast(th_init, root=0) # 把向量the_init的值分片赋值给var_list set_from_flat(th_init) #同步 vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Prepare for rollouts # ---------------------------------------- # 这是一个生成数据的迭代器 seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, num_reward=num_reward) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() # 双端队列 lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() # 计算累积回报 add_vtarg_and_adv(seg, gamma, lam, num_reward=num_reward) ###########$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ToDo # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) # ob, ac, atarg, tdlamret 的类型都是ndarray #ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] _, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] #print(seg['ob'].shape,type(seg['ob'])) #print(seg['ac'],type(seg['ac'])) #print(seg['adv'],type(seg['adv'])) #print(seg["tdlamret"].shape,type(seg['tdlamret'])) vpredbefore = seg["vpred"] # predicted value function before udpate # 标准化 #print("============================== atarg =========================================================") #print(atarg) atarg = (atarg - np.mean(atarg, axis=0)) / np.std( atarg, axis=0) # standardized advantage function estimate #atarg = (atarg) / np.max(np.abs(atarg),axis=0) #print('======================================= standardized atarg ====================================') #print(atarg) if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy ## set old parameter values to new parameter values assign_old_eq_new() G = None S = None mr_lossbefore = np.zeros((num_reward, len(loss_names))) grad_norm = np.zeros((num_reward + 1)) for i in range(num_reward): args = seg["ob"], seg["ac"], atarg[:, i] #print(atarg[:,i]) # 算是args的一个sample,每隔5个取出一个 fvpargs = [arr[::5] for arr in args] # 这个函数计算fisher matrix 与向量 p 的 乘积 def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p with timed("computegrad of " + str(i + 1) + ".th reward"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) mr_lossbefore[i] = lossbefore g = allmean(g) #print("***************************************************************") #print(g) if isinstance(G, np.ndarray): G = np.vstack((G, g)) else: G = g # g是目标函数的梯度 # 利用共轭梯度获得更新方向 if np.allclose(g, 0): logger.log("Got zero gradient. not updating") else: with timed("cg of " + str(i + 1) + ".th reward"): # stepdir 是更新方向 stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm grad_norm[i] = np.linalg.norm(fullstep) assert np.isfinite(stepdir).all() if isinstance(S, np.ndarray): S = np.vstack((S, stepdir)) else: S = stepdir #print('======================================= G ====================================') #print(G) #print('======================================= S ====================================') #print(S) try: new_coe = get_coefficient(G, S) #coe = 0.99 * coe + 0.01 * new_coe coe = new_coe coe_save.append(coe) #根据梯度的夹角调整参数 # GG = np.dot(S, S.T) # D = np.sqrt(np.diag(1/np.diag(GG))) # GG = np.dot(np.dot(D,GG),D) # #print('======================================= inner product ====================================') # #print(GG) # adj = np.sum(GG) / (num_reward ** 2) adj = 1 #print('======================================= adj ====================================') #print(adj) adj_save.append(adj) adj_max_kl = adj * max_kl ################################################################# grad_norm = grad_norm * np.sqrt(adj) stepdir = np.dot(coe, S) g = np.dot(coe, G) lossbefore = np.dot(coe, mr_lossbefore) ################################################################# shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / adj_max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm grad_norm[num_reward] = np.linalg.norm(fullstep) grad_save.append(grad_norm) expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() def compute_mr_losses(): mr_losses = np.zeros((num_reward, len(loss_names))) for i in range(num_reward): args = seg["ob"], seg["ac"], atarg[:, i] one_reward_loss = allmean(np.array(compute_losses(*args))) mr_losses[i] = one_reward_loss mr_loss = np.dot(coe, mr_losses) return mr_loss, mr_losses # 做10次搜索 for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) mr_loss_new, mr_losses_new = compute_mr_losses() mr_impro = mr_losses_new - mr_lossbefore meanlosses = surr, kl, *_ = allmean(np.array(mr_loss_new)) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log("Got non-finite value of losses -- bad!") elif kl > adj_max_kl * 1.5: logger.log("violated KL constraint. shrinking step.") elif improve < 0: logger.log("surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") impro_save.append(np.hstack((mr_impro[:, 0], improve))) break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) with timed("vf"): #print('======================================= tdlamret ====================================') #print(seg["tdlamret"]) for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): #with tf.Session() as sess: # sess.run(tf.global_variables_initializer()) # aaa = sess.run(pi.vf,feed_dict={ob:mbob,ret:mbret}) # print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") # print(aaa.shape) # print(mbret.shape) g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) except: print('error') #print(mbob,mbret) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: logger.dump_tabular() #pdb.set_trace() np.save(save_dir + 'coe.npy', coe_save) np.save(save_dir + 'grad.npy', grad_save) np.save(save_dir + 'improve.npy', impro_save) np.save(save_dir + 'adj.npy', adj_save) return pi
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs*nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
def _serialize_variables(): sess = get_session() variables = tf.trainable_variables() values = sess.run(variables) return {var.name: value for var, value in zip(variables, values)}