def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
             nsteps, ent_coef, vf_coef, max_grad_norm, gamma=0.99,
             sil_update=1, fn_reward=None, fn_obs=None, sil_value=0.01,
             sil_alpha=0.6, sil_beta=0.1, sil_loss=0.1, mpi_rank_weight=1,
             comm=None, microbatch_size=None, ppo=True, prev_pi=None, silm=None):
    self.sess = sess = get_session()

    if MPI is not None and comm is None:
        comm = MPI.COMM_WORLD

    if ppo:
        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR MODELS
            # act_model is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # train_model is used for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

            sil_model = policy(None, None, sess=sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of the old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of the old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy.
        # Entropy is used to improve exploration by limiting premature
        # convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = policy gradient loss - entropy * entropy coefficient
        #              + value coefficient * value loss

        # Clip the value to reduce variability during critic training.
        # Get the predicted value.
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(
            train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
        # Unclipped value loss
        vf_losses1 = tf.square(vpred - R)
        # Clipped value loss
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate the ratio (current policy / old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining loss = -J is equivalent to maximizing J
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING THE LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')

        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(
                comm, learning_rate=LR,
                mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        # zip pairs each gradient with its associated parameter,
        # e.g. zip(ABCD, xyza) => Ax, By, Cz, Da
        grads_and_var = list(zip(grads, var))

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.sil_model = sil_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
    else:
        params = tf.trainable_variables(prev_pi)
        self.LR = LR = tf.placeholder(tf.float32, [])
        self.pi = silm

        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(
                comm, learning_rate=LR,
                mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

        # Self-imitation learning is built on the policy passed in via `silm`.
        self.sil = SelfImitation(
            self.pi.X, self.pi.vf, self.pi.pd.entropy(), None, self.pi.pd.neglogp,
            ac_space, fn_reward=fn_reward, fn_obs=fn_obs,
            n_env=nbatch_act, n_update=sil_update,
            w_value=sil_value, w_entropy=ent_coef, gamma=gamma,
            max_steps=50000, max_nlogp=100,
            alpha=sil_alpha, beta=sil_beta)

        def sil_train(cur_lr):
            return self.sil.train(sess, cur_lr)

        self.sil.set_loss_weight(sil_loss)
        self.sil.build_train_op(params, self.trainer, LR)
        self.sil_train = sil_train

    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)

    initialize()
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
    if MPI is not None:
        sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101
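
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the clipped PPO
# surrogate and clipped value loss built in the TensorFlow graph above,
# re-expressed in plain NumPy so the arithmetic can be checked in isolation.
# All array values below are made-up examples; only the formulas mirror the
# graph ops (ratio = exp(old_neglogp - neglogp), max of clipped/unclipped terms).
# ---------------------------------------------------------------------------
def _ppo_losses_numpy_sketch():
    import numpy as np  # local import keeps the sketch self-contained

    adv = np.array([1.0, -0.5, 2.0])          # hypothetical advantages
    ret = np.array([1.5, 0.2, 3.0])           # hypothetical returns
    old_vpred = np.array([1.0, 0.5, 2.0])     # value predictions at rollout time
    vpred = np.array([1.4, 0.1, 2.9])         # current value predictions
    old_neglogp = np.array([0.7, 1.2, 0.3])   # -log pi_old(a|s)
    neglogp = np.array([0.6, 1.4, 0.2])       # -log pi(a|s)
    cliprange = 0.2

    # Clipped value loss: penalize the worse of the clipped / unclipped error.
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum((vpred - ret) ** 2, (vpred_clipped - ret) ** 2))

    # Clipped policy-gradient loss on the probability ratio of new to old policy.
    ratio = np.exp(old_neglogp - neglogp)
    pg_loss = np.mean(np.maximum(-adv * ratio,
                                 -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)))
    return pg_loss, vf_loss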
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
             ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
             alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6),
             lrschedule='linear', sil_update=4, sil_beta=0.0):
    sess = tf_util.make_session()
    nbatch = nenvs * nsteps

    A = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True)
    # sil and train model use the same params as step model?
    sil_model = policy(sess, ob_space, ac_space, nenvs, nsteps, reuse=True)

    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=train_model.pi, labels=A)
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
    value_avg = tf.reduce_mean(train_model.vf)

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()
        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        policy_loss, value_loss, policy_entropy, v_avg, _ = sess.run(
            [pg_loss, vf_loss, entropy, value_avg, _train], td_map)
        return policy_loss, value_loss, policy_entropy, v_avg

    self.sil = SelfImitation(
        sil_model.X, sil_model.vf, sil_model.entropy,
        sil_model.value, sil_model.neg_log_prob,
        ac_space, np.sign,
        n_env=nenvs, n_update=sil_update, beta=sil_beta)
    self.sil.build_train_op(params, trainer, LR, max_grad_norm=max_grad_norm)

    def sil_train():
        cur_lr = lr.value()
        return self.sil.train(sess, cur_lr)

    def save(save_path):
        ps = sess.run(params)
        make_path(osp.dirname(save_path))
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.sil_train = sil_train
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
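
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the core self-imitation
# update that `SelfImitation` above builds in TensorFlow, written in NumPy and
# assuming the formulation of Oh et al. (2018): imitate past actions only when
# the stored discounted return R exceeds the current value estimate V(s).
# `neglogp`, `returns`, `values`, and `w_value` are hypothetical inputs; the
# actual class also applies reward clipping (np.sign above) and replay.
# ---------------------------------------------------------------------------
def _sil_loss_numpy_sketch(neglogp, returns, values, w_value=0.01):
    import numpy as np  # local import keeps the sketch self-contained

    advantage = np.maximum(returns - values, 0.0)   # (R - V)_+ : only "good" transitions
    policy_loss = np.mean(neglogp * advantage)      # imitate actions that beat V(s)
    value_loss = 0.5 * np.mean(advantage ** 2)      # push V(s) up toward R
    return policy_loss + w_value * value_loss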
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
             nsteps, ent_coef, vf_coef, max_grad_norm, gamma=0.99,
             sil_update=1, fn_reward=None, fn_obs=None,
             sil_value=0.01, sil_alpha=0.6, sil_beta=0.1):
    sess = tf.get_default_session()

    act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True)
    sil_model = policy(sess, ob_space, ac_space, None, None, reuse=True)

    A = train_model.pdtype.sample_placeholder([None])
    ADV = tf.placeholder(tf.float32, [None])
    R = tf.placeholder(tf.float32, [None])
    OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
    OLDVPRED = tf.placeholder(tf.float32, [None])
    LR = tf.placeholder(tf.float32, [])
    CLIPRANGE = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)
    entropy = tf.reduce_mean(train_model.pd.entropy())

    vpred = train_model.vf
    vpredclipped = OLDVPRED + tf.clip_by_value(
        train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
    vf_losses1 = tf.square(vpred - R)
    vf_losses2 = tf.square(vpredclipped - R)
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
    ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
    pg_losses = -ADV * ratio
    pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
    pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
    approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
    clipfrac = tf.reduce_mean(
        tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    with tf.variable_scope('model'):
        params = tf.trainable_variables()
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
    _train = trainer.apply_gradients(grads)

    def train(lr, cliprange, obs, returns, masks, actions, values,
              neglogpacs, states=None):
        advs = returns - values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        td_map = {train_model.X: obs, A: actions, ADV: advs, R: returns,
                  LR: lr, CLIPRANGE: cliprange,
                  OLDNEGLOGPAC: neglogpacs, OLDVPRED: values}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        return sess.run(
            [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
            td_map)[:-1]

    self.loss_names = [
        'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'
    ]

    # Self-Imitation Learning
    self.sil = SelfImitation(
        sil_model.X, sil_model.vf, sil_model.entropy,
        sil_model.value, sil_model.neg_log_prob,
        ac_space, fn_reward=fn_reward, fn_obs=fn_obs,
        n_env=nbatch_act, n_update=sil_update,
        w_value=sil_value, w_entropy=ent_coef, gamma=gamma,
        max_steps=50000, max_nlogp=100,
        alpha=sil_alpha, beta=sil_beta)
    self.sil.set_loss_weight(0.1)
    self.sil.build_train_op(params, trainer, LR)

    def sil_train(cur_lr):
        return self.sil.train(sess, cur_lr)

    def save(save_path):
        ps = sess.run(params)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)
        # If you want to load weights, also save/load the observation scaling
        # inside VecNormalize.

    self.train = train
    self.train_model = train_model
    self.sil_train = sil_train
    self.act_model = act_model
    self.step = act_model.step
    self.value = act_model.value
    self.initial_state = act_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)  # pylint: disable=E1101
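
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): how the replay side of
# self-imitation can prioritize samples, assuming priorities proportional to
# (R - V)_+ ** alpha as in prioritized experience replay. The `alpha` default
# mirrors the `sil_alpha` argument above, but the buffer contents and sampling
# details here are hypothetical and may differ from `SelfImitation` internals.
# ---------------------------------------------------------------------------
def _sil_priority_sample_sketch(returns, values, batch_size=4, alpha=0.6, seed=0):
    import numpy as np  # local import keeps the sketch self-contained

    rng = np.random.default_rng(seed)
    # Only transitions whose stored return beats the value estimate get weight.
    priorities = np.maximum(returns - values, 0.0) ** alpha + 1e-6
    probs = priorities / priorities.sum()
    # Sample transition indices with probability proportional to their priority.
    return rng.choice(len(returns), size=batch_size, p=probs, replace=True)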