def update_policy_params(self, comm, loss, mpi_rank_weight, LR, max_grad_norm):
    # UPDATE THE PARAMETERS USING LOSS
    # 1. Get the model parameters
    params = tf.trainable_variables('ppo2_model')

    # 2. Build our trainer
    if comm is not None and comm.Get_size() > 1:
        self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
    else:
        self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

    # 3. Calculate the gradients
    grads_and_var = self.trainer.compute_gradients(loss, params)
    grads, var = zip(*grads_and_var)

    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads_and_var = list(zip(grads, var))
    # zip aggregates each gradient with its associated parameter
    # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

    self.grads = grads
    self.var = var
    self._train_op = self.trainer.apply_gradients(grads_and_var)
    return grads
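For reference, a minimal NumPy sketch (not part of the original code) of what tf.clip_by_global_norm does to the gradient list before apply_gradients: every gradient is rescaled by the same factor so that their joint L2 norm does not exceed max_grad_norm.

import numpy as np

def clip_by_global_norm(grads, max_grad_norm):
    # Joint L2 norm over all gradients, as if they were one flat vector.
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    # Rescale only when the norm exceeds the threshold; otherwise gradients are unchanged.
    scale = max_grad_norm / max(global_norm, max_grad_norm)
    return [g * scale for g in grads], global_norm

# Example: two gradient tensors with global norm 5.0 clipped down to norm 0.5.
clipped, norm = clip_by_global_norm([np.array([3.0, 0.0]), np.array([0.0, 4.0])], 0.5)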
def __init__(self, *, ac_space, policy_network, value_network=None, ent_coef, vf_coef, max_grad_norm):
    super(Model, self).__init__(name='PPO2Model')
    self.train_model = PolicyWithValue(ac_space, policy_network, value_network, estimate_q=False)
    if MPI is not None:
        self.optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.train_model.trainable_variables)
    else:
        self.optimizer = tf.keras.optimizers.Adam()
    self.ent_coef = ent_coef
    self.vf_coef = vf_coef
    self.max_grad_norm = max_grad_norm
    self.step = self.train_model.step
    self.mode = self.train_model.mode
    self.value = self.train_model.value
    self.initial_state = self.train_model.initial_state
    self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
    if MPI is not None:
        sync_from_root(self.variables)
def update_discriminator_params(self, comm, discriminator_loss, mpi_rank_weight, LR, max_grad_norm):
    # UPDATE DISCRIMINATOR PARAMETERS USING DISCRIMINATOR_LOSS
    # 1. Get the model parameters
    disc_params = tf.trainable_variables('discriminator_model')

    # 2. Build our trainer
    if comm is not None and comm.Get_size() > 1:
        self.disc_trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
    else:
        self.disc_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # self.disc_trainer = tf.train.GradientDescentOptimizer(learning_rate=LR)

    # 3. Calculate gradients
    disc_grads_and_var = self.disc_trainer.compute_gradients(discriminator_loss, disc_params)
    self._disc_train_op = self.disc_trainer.apply_gradients(disc_grads_and_var)
def get_train_op(self, loss, params, comm):
    # 2. Build our trainer
    if comm is not None and comm.Get_size() > 1:
        trainer = MpiAdamOptimizer(comm, learning_rate=self.LR, mpi_rank_weight=self.mpi_rank_weight, epsilon=1e-5)
    else:
        trainer = tf.train.AdamOptimizer(learning_rate=self.LR, epsilon=1e-5)

    # 3. Calculate the gradients
    grads_and_var = trainer.compute_gradients(loss, params)
    grads, var = zip(*grads_and_var)

    if self.max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, _grad_norm = tf.clip_by_global_norm(grads, self.max_grad_norm)
    # zip aggregates each gradient with its associated parameter
    # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
    grads_and_var = list(zip(grads, var))

    _train_op = trainer.apply_gradients(grads_and_var)
    return _train_op, grads_and_var
def update_vae_params(self, comm, loss, mpi_rank_weight, LR, max_grad_norm):
    params = tf.trainable_variables('vae') + tf.trainable_variables('ppo2_model/vae')

    if comm is not None and comm.Get_size() > 1:
        self.vae_trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
    else:
        self.vae_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

    grads_and_var = self.vae_trainer.compute_gradients(loss, params)
    grads, var = zip(*grads_and_var)

    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads_and_var = list(zip(grads, var))
    # zip aggregates each gradient with its associated parameter
    # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

    self.vae_grads = grads
    self.vae_var = var
    self.vae_train_op = self.vae_trainer.apply_gradients(grads_and_var)
    return grads
def update_all_params(self, comm, ppo_loss, disc_loss, mpi_rank_weight, LR, max_grad_norm):
    ppo_params = tf.trainable_variables('ppo2_model')
    disc_params = tf.trainable_variables('discriminator_model')

    if comm is not None and comm.Get_size() > 1:
        self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
    else:
        self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

    ppo_grads_and_var = self.trainer.compute_gradients(ppo_loss, ppo_params)
    ppo_grads, ppo_var = zip(*ppo_grads_and_var)
    disc_grads_and_var = self.trainer.compute_gradients(disc_loss, disc_params)
    disc_grads, disc_var = zip(*disc_grads_and_var)

    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(ppo_grads + disc_grads, max_grad_norm)
    else:
        # Without clipping, just concatenate the two gradient tuples.
        grads = ppo_grads + disc_grads
    grads_and_var = list(zip(grads, ppo_var + disc_var))
    self.all_train_op = self.trainer.apply_gradients(grads_and_var)
def optimize(self, learning_rate=6.25e-5, epsilon=1.5e-4, **adam_kwargs):
    """
    Create a TF Op that optimizes the objective.

    Args:
      learning_rate: the Adam learning rate.
      epsilon: the Adam epsilon.
    """
    if self.comm is not None and self.comm.Get_size() > 1:
        optim = MpiAdamOptimizer(self.comm, learning_rate=learning_rate,
                                 mpi_rank_weight=self.mpi_rank_weight, epsilon=epsilon, **adam_kwargs)
    else:
        optim = tf.train.AdamOptimizer(learning_rate=learning_rate, epsilon=epsilon, **adam_kwargs)

    if self.use_l2reg:
        params = tf.trainable_variables('online')
        weight_params = [v for v in params if '/bias' not in v.name]
        l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params])
        self.loss = self.loss + l2_loss * 1e-4

    return optim.minimize(self.loss)
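The optimize method above folds an L2 penalty on weight (non-bias) variables into the loss. A minimal NumPy sketch of that penalty, assuming a hypothetical dict of parameter arrays keyed by variable name:

import numpy as np

def l2_penalty(named_params, coef=1e-4):
    # Sum of 0.5 * ||w||^2 over weight matrices only, mirroring tf.nn.l2_loss;
    # bias vectors (names containing '/bias') are excluded from regularization.
    weight_params = {k: v for k, v in named_params.items() if '/bias' not in k}
    return coef * sum(0.5 * np.sum(np.square(w)) for w in weight_params.values())

# Example with hypothetical layer names.
params = {'online/dense/kernel': np.ones((2, 2)), 'online/dense/bias': np.ones(2)}
penalty = l2_penalty(params)  # 1e-4 * 0.5 * 4 = 2e-4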
class Model(object):
    """
    We use this object to:
    __init__:
        - Creates the step_model
        - Creates the train_model

    train():
        - Make the training part (feedforward and backpropagation of gradients)

    save/load():
        - Save/load the model
    """

    def __init__(self, ob_space, ac_space, ent_coef, vf_coef, max_grad_norm,
                 mpi_rank_weight=1, comm=None, normalize_observations=True,
                 normalize_returns=True, use_tensorboard=False, tb_log_dir=None):
        self.sess = sess = get_session()
        self.use_tensorboard = use_tensorboard

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        # CREATE OUR TWO MODELS
        network_spec = [
            {'layer_type': 'dense', 'units': 256, 'activation': 'relu',
             'nodes_in': ['observation_self'], 'nodes_out': ['main']},
            {'layer_type': 'dense', 'units': 128, 'activation': 'relu',
             'nodes_in': ['main'], 'nodes_out': ['main']},
            {'layer_type': 'dense', 'units': 128, 'activation': 'relu',
             'nodes_in': ['main'], 'nodes_out': ['main']},
            {'layer_type': 'dense', 'units': 128, 'activation': 'relu',
             'nodes_in': ['main'], 'nodes_out': ['main']},
        ]

        vnetwork_spec = [
            {'layer_type': 'dense', 'units': 256, 'activation': 'relu',
             'nodes_in': ['observation_self'], 'nodes_out': ['main']},
            {'layer_type': 'dense', 'units': 128, 'activation': 'relu',
             'nodes_in': ['main'], 'nodes_out': ['main']},
            {'layer_type': 'dense', 'units': 128, 'activation': 'relu',
             'nodes_in': ['main'], 'nodes_out': ['main']},
            {'layer_type': 'dense', 'units': 128, 'activation': 'relu',
             'nodes_in': ['main'], 'nodes_out': ['main']},
        ]

        # act_model that is used for sampling
        act_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space,
                              network_spec=network_spec, v_network_spec=vnetwork_spec,
                              stochastic=True, reuse=False, build_act=True,
                              trainable_vars=None, not_trainable_vars=None,
                              gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999,
                              normalize_observations=normalize_observations,
                              normalize_returns=normalize_returns)

        # Train model for training
        train_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space,
                                network_spec=network_spec, v_network_spec=vnetwork_spec,
                                stochastic=True, reuse=True, build_act=True,
                                trainable_vars=None, not_trainable_vars=None,
                                gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999,
                                normalize_observations=normalize_observations,
                                normalize_returns=normalize_returns)

        # CREATE THE PLACEHOLDERS
        self.A = A = {k: v.sample_placeholder([None]) for k, v in train_model.pdtypes.items()}
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = sum([train_model.pds[k].neglogp(A[k]) for k in train_model.pdtypes.keys()])

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        # entropy = tf.reduce_mean(train_model.entropy)
        entropy = tf.reduce_mean(sum([train_model.pds[k].entropy() for k in train_model.pdtypes.keys()]))

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.scaled_value_tensor
        vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED, -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables(scope="ppo")

        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip aggregates each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.act
        self.value = act_model.value
        self.initial_state = act_model.zero_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101

        if self.use_tensorboard:
            self.attach_tensorboard(tb_log_dir)
            self.tb_step = 0

    def train(self, lr, cliprange, obs, actions, returns, values, neglogpacs, states=None):
        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        # Turn the obs into correct format
        td_map = {
            self.ADV: advs,
            self.R: returns,
            self.LR: lr,
            self.CLIPRANGE: cliprange,
            self.OLDNEGLOGPAC: neglogpacs,
            self.OLDVPRED: values,
        }
        obs_map = {self.train_model.phs[k]: v for k, v in obs.items()}
        td_map.update(obs_map)
        actions_map = {self.A[k]: v for k, v in actions.items()}
        td_map.update(actions_map)

        if states is not None:
            pass
            # td_map[self.train_model.phs['policy_net_lstm2_state_c']] = np.repeat([states['policy_net_lstm2_state_c'][0]], len(obs), 0)
            # td_map[self.train_model.phs['policy_net_lstm2_state_h']] = np.repeat([states['policy_net_lstm2_state_h'][0]], len(obs), 0)
            # td_map[self.train_model.phs['vpred_net_lstm2_state_c']] = np.repeat([states['vpred_net_lstm2_state_c'][0]], len(obs), 0)
            # td_map[self.train_model.phs['vpred_net_lstm2_state_h']] = np.repeat([states['vpred_net_lstm2_state_h'][0]], len(obs), 0)

        if self.use_tensorboard:
            losses = self.sess.run(self.stats_list + [self._train_op, self.merged], td_map)
            self.tb_writer.add_summary(losses.pop(), self.tb_step)
            self.tb_step += 1
            losses = losses[:-1]
        else:
            losses = self.sess.run(self.stats_list + [self._train_op], td_map)[:-1]

        return losses

    def attach_tensorboard(self, logdir):
        for i in range(len(self.stats_list)):
            tf.summary.scalar(self.loss_names[i], self.stats_list[i])
        self.merged = tf.summary.merge_all()
        logdir = os.path.join(os.getcwd(), logdir)
        logdir = os.path.join(logdir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
        self.tb_writer = tf.summary.FileWriter(logdir, self.sess.graph)
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
             nsteps, ent_coef, vf_coef, max_grad_norm):
    sess = get_session()

    with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
        # CREATE OUR TWO MODELS
        # act_model that is used for sampling
        act_model = policy(nbatch_act, 1, sess)
        # Train model for training
        train_model = policy(None, nsteps, sess)

    # CREATE THE PLACEHOLDERS
    A = train_model.pdtype.sample_placeholder([None])
    DIMSEL = train_model.pdtype.sample_placeholder([None])
    MEANNOW = train_model.pdtype.sample_placeholder([None])
    LOGSTDNOW = train_model.pdtype.sample_placeholder([None])
    ADV = tf.placeholder(tf.float32, [None])
    R = tf.placeholder(tf.float32, [None])
    # Keep track of old actor
    OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
    RHO_NOW = tf.placeholder(tf.float32, [None])
    # Keep track of old critic
    OLDVPRED = tf.placeholder(tf.float32, [None])
    LR = tf.placeholder(tf.float32, [])
    # Cliprange
    CLIPRANGE = tf.placeholder(tf.float32, [])
    KLCONST = tf.placeholder(tf.float32, [])
    KL_REST = tf.placeholder(tf.float32, [None])

    neglogpac = train_model.pd.neglogp(A)
    mean = train_model.pd.mean
    logstd = train_model.pd.logstd

    # Calculate the entropy
    # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
    entropy = tf.reduce_mean(train_model.pd.entropy())

    # CALCULATE THE LOSS
    # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

    # Clip the value to reduce variability during Critic training
    # Get the predicted value
    vpred = train_model.vf
    vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
    # Unclipped value
    vf_losses1 = tf.square(vpred - R)
    # Clipped value
    vf_losses2 = tf.square(vpredclipped - R)
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    # Calculate ratio (pi current policy / pi old policy)
    ratio = tf.exp((-0.5 * tf.square((A - mean) / tf.exp(logstd)) - logstd
                    + 0.5 * tf.square((A - MEANNOW) / tf.exp(LOGSTDNOW)) + LOGSTDNOW) * DIMSEL)  # * tf.minimum(1.0, RHO_NOW)
    r = tf.reduce_prod(ratio, axis=-1)

    # Defining Loss = - J is equivalent to max J
    pg_losses = -tf.reduce_prod(ratio, axis=-1) * ADV  # * tf.minimum(1.0, RHO_NOW)
    pg_losses2 = -tf.reduce_prod(tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE), axis=-1) * ADV  # * tf.minimum(1.0, RHO_NOW)

    # Final PG loss
    # pg_loss = tf.reduce_mean(tf.stop_gradient(tf.maximum(pg_losses, pg_losses2)) * (-neglogpac)) + .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
    approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC) * KL_REST)
    approxoldkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC - tf.log(RHO_NOW)))
    kloldnew = tf.reduce_mean(tf.reduce_sum(
        logstd - LOGSTDNOW
        + 0.5 * (tf.square(tf.exp(LOGSTDNOW)) + tf.square(mean - MEANNOW)) / tf.square(tf.exp(logstd))
        - 0.5,
        axis=1))
    clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
    pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) + KLCONST * approxkl  # * tf.minimum(1.0, RHO_NOW)

    # Total loss
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    # UPDATE THE PARAMETERS USING LOSS
    # 1. Get the model parameters
    params = tf.trainable_variables('ppo2_model')

    # 2. Build our trainer
    if MPI is not None:
        trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
    else:
        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

    # 3. Calculate the gradients
    grads_and_var = trainer.compute_gradients(loss, params)
    grads, var = zip(*grads_and_var)

    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads_and_var = list(zip(grads, var))
    # zip aggregates each gradient with its associated parameter
    # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

    _train = trainer.apply_gradients(grads_and_var)

    def train(lr, cliprange, klconst, dimsel, obs, returns, advs, masks, actions,
              values, neglogpacs, mean_now, logstd_now, rho_now, kl_rest, states=None):
        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {
            train_model.X: obs,
            A: actions,
            ADV: advs,
            R: returns,
            LR: lr,
            CLIPRANGE: cliprange,
            OLDNEGLOGPAC: neglogpacs,
            OLDVPRED: values,
            MEANNOW: mean_now,
            LOGSTDNOW: logstd_now,
            KLCONST: klconst,
            RHO_NOW: rho_now,
            KL_REST: kl_rest,
            DIMSEL: dimsel,
        }
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks

        return sess.run(
            [pg_loss, vf_loss, entropy, approxkl, clipfrac, kloldnew, approxoldkl, _train],
            td_map)[:-1]

    self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
                       'clipfrac', 'kloldnew', 'approxoldkl']
    self.train = train
    self.train_model = train_model
    self.act_model = act_model
    self.step = act_model.step
    self.meanlogstd = act_model.meanlogstd
    self.value = act_model.value
    self.values = train_model.value
    self.meanlogstds = train_model.meanlogstd
    self.initial_state = act_model.initial_state
    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        initialize()
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
    if MPI is not None:
        sync_from_root(sess, global_variables)  # pylint: disable=E1101
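In this variant the ratio is formed per action dimension from diagonal-Gaussian log-densities (rather than from the summed neglogp values), so each dimension can be masked via DIMSEL and clipped separately before the product over dimensions. A minimal NumPy sketch of that per-dimension ratio, assuming 1-D arrays for a single sample:

import numpy as np

def per_dim_ratio(a, mean_new, logstd_new, mean_old, logstd_old, dimsel):
    # log N(a; mu, sigma) per dimension, dropping the shared -0.5*log(2*pi) constant,
    # which cancels in the difference of log-densities anyway.
    logp_new = -0.5 * np.square((a - mean_new) / np.exp(logstd_new)) - logstd_new
    logp_old = -0.5 * np.square((a - mean_old) / np.exp(logstd_old)) - logstd_old
    ratio = np.exp((logp_new - logp_old) * dimsel)   # dimsel masks out unused dimensions
    return ratio, np.prod(ratio)                     # per-dimension ratios and their product

r, r_joint = per_dim_ratio(np.zeros(3), np.zeros(3), np.zeros(3),
                           np.full(3, 0.1), np.zeros(3), np.ones(3))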
def __init__(self, ob_space, ac_space, max_grad_norm, beta, icm_lr_scale):
    sess = get_session()

    # TODO find a better way
    input_shape = [ob_space.shape[0], ob_space.shape[1], ob_space.shape[2]]
    self.action_shape = 36

    # Placeholders
    self.state_ = phi_state = tf.placeholder(tf.float32, [None, *input_shape], name="icm_state")
    self.next_state_ = phi_next_state = tf.placeholder(tf.float32, [None, *input_shape], name="icm_next_state")
    self.action_ = action = tf.placeholder(tf.float32, [None], name="icm_action")

    with tf.variable_scope('icm_model'):
        # Feature encoding
        # Aka pass state and next_state to create phi(state), phi(next_state)
        # state --> phi(state)
        phi_state = self.feature_encoding(self.state_)

        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
            # next_state --> phi(next_state)
            phi_next_state = self.feature_encoding(self.next_state_)

        # INVERSE MODEL
        pred_actions_logits, pred_actions_prob = self.inverse_model(phi_state, phi_next_state)

        # FORWARD MODEL
        pred_phi_next_state = self.forward_model(action, phi_state)

    # CALCULATE THE ICM LOSS

    # Inverse loss LI
    # We calculate the cross entropy between our predicted action ât and the true action at
    # Cast the action labels to int (required by sparse softmax cross entropy)
    labels = tf.cast(action, tf.int32)
    self.inv_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pred_actions_logits, labels=labels),
        name="inverse_loss")

    # Forward loss
    # LF = 1/2 || pred_phi_next_state - phi_next_state ||
    # TODO 0.5 * ?
    self.forw_loss = tf.reduce_mean(
        tf.square(tf.subtract(pred_phi_next_state, phi_next_state)),
        name="forward_loss")

    # TODO predictor lr scale?
    # ICM_LOSS = [(1 - beta) * LI + beta * LF] * Predictor_Lr_scale
    self.icm_loss = ((1 - beta) * self.inv_loss + beta * self.forw_loss) * icm_lr_scale

    # UPDATE THE PARAMETERS USING LOSS
    # 1. Get the model parameters
    icm_params = tf.trainable_variables('icm_model')

    # 2. Build our trainer
    icm_trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=1e-4, epsilon=1e-5)

    # 3. Calculate the gradients
    icm_grads_and_var = icm_trainer.compute_gradients(self.icm_loss, icm_params)
    icm_grads, icm_var = zip(*icm_grads_and_var)

    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        icm_grads, icm_grad_norm = tf.clip_by_global_norm(icm_grads, max_grad_norm)
    icm_grads_and_var = list(zip(icm_grads, icm_var))
    # zip aggregates each gradient with its associated parameter
    # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

    _icm_train = icm_trainer.apply_gradients(icm_grads_and_var)

    if MPI.COMM_WORLD.Get_rank() == 0:
        print("Initialize")
        initialize()
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
    print("GLOBAL VARIABLES", global_variables)
    sync_from_root(sess, global_variables)  # pylint: disable=E1101
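A minimal sketch of how the ICM objective above combines the inverse and forward losses; inv_loss and forw_loss stand in for the cross-entropy and feature-prediction terms computed in the graph, and the beta / icm_lr_scale values are hypothetical:

def icm_loss(inv_loss, forw_loss, beta=0.2, icm_lr_scale=10.0):
    # ICM_LOSS = [(1 - beta) * LI + beta * LF] * icm_lr_scale
    # beta trades off the inverse (action-prediction) and forward (feature-prediction) losses.
    return ((1 - beta) * inv_loss + beta * forw_loss) * icm_lr_scale

loss = icm_loss(inv_loss=1.5, forw_loss=0.3)  # (0.8 * 1.5 + 0.2 * 0.3) * 10 = 12.6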
def __init__(self, actor, critic, memory, observation_shape, action_shape,
             param_noise=None, action_noise=None, gamma=0.99, tau=0.001,
             normalize_returns=False, enable_popart=False, normalize_observations=True,
             batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.),
             return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4,
             critic_lr=1e-3, clip_norm=None, reward_scale=1.):
    # Parameters.
    self.gamma = gamma
    self.tau = tau
    self.memory = memory
    self.normalize_observations = normalize_observations
    self.normalize_returns = normalize_returns
    self.action_noise = action_noise
    self.param_noise = param_noise
    self.action_range = action_range
    self.return_range = return_range
    self.observation_range = observation_range
    self.observation_shape = observation_shape
    self.critic = critic
    self.actor = actor
    self.clip_norm = clip_norm
    self.enable_popart = enable_popart
    self.reward_scale = reward_scale
    self.batch_size = batch_size
    self.stats_sample = None
    self.critic_l2_reg = critic_l2_reg
    self.actor_lr = tf.constant(actor_lr)
    self.critic_lr = tf.constant(critic_lr)

    # Observation normalization.
    if self.normalize_observations:
        with tf.name_scope('obs_rms'):
            self.obs_rms = RunningMeanStd(shape=observation_shape)
    else:
        self.obs_rms = None

    # Return normalization.
    if self.normalize_returns:
        with tf.name_scope('ret_rms'):
            self.ret_rms = RunningMeanStd()
    else:
        self.ret_rms = None

    # Create target networks.
    self.target_critic = Critic(actor.nb_actions, observation_shape, name='target_critic',
                                network=critic.network, **critic.network_kwargs)
    self.target_actor = Actor(actor.nb_actions, observation_shape, name='target_actor',
                              network=actor.network, **actor.network_kwargs)

    # Set up parts.
    if self.param_noise is not None:
        self.setup_param_noise()

    if MPI is not None:
        comm = MPI.COMM_WORLD
        self.actor_optimizer = MpiAdamOptimizer(comm, self.actor.trainable_variables)
        self.critic_optimizer = MpiAdamOptimizer(comm, self.critic.trainable_variables)
    else:
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

    logger.info('setting up actor optimizer')
    actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_variables]
    actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
    logger.info('  actor shapes: {}'.format(actor_shapes))
    logger.info('  actor params: {}'.format(actor_nb_params))

    logger.info('setting up critic optimizer')
    critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_variables]
    critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
    logger.info('  critic shapes: {}'.format(critic_shapes))
    logger.info('  critic params: {}'.format(critic_nb_params))

    if self.critic_l2_reg > 0.:
        critic_reg_vars = []
        for layer in self.critic.network_builder.layers[1:]:
            critic_reg_vars.append(layer.kernel)
        for var in critic_reg_vars:
            logger.info('  regularizing: {}'.format(var.name))
        logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))

    logger.info('setting up critic target updates ...')
    for var, target_var in zip(self.critic.variables, self.target_critic.variables):
        logger.info('  {} <- {}'.format(target_var.name, var.name))
    logger.info('setting up actor target updates ...')
    for var, target_var in zip(self.actor.variables, self.target_actor.variables):
        logger.info('  {} <- {}'.format(target_var.name, var.name))

    if self.param_noise:
        logger.info('setting up param noise')
        for var, perturbed_var in zip(self.actor.variables, self.perturbed_actor.variables):
            if var in actor.perturbable_vars:
                logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
            else:
                logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
        for var, perturbed_var in zip(self.actor.variables, self.perturbed_adaptive_actor.variables):
            if var in actor.perturbable_vars:
                logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
            else:
                logger.info('  {} <- {}'.format(perturbed_var.name, var.name))

    if self.normalize_returns and self.enable_popart:
        self.setup_popart()

    self.initial_state = None  # recurrent architectures not supported yet
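The constructor above only wires up the target networks; the Polyak averaging they are meant to receive each training step elsewhere in the agent (with the tau defined here) follows the pattern below. A minimal NumPy sketch under that assumption, with plain arrays standing in for the variables:

import numpy as np

def soft_target_update(target_params, online_params, tau=0.001):
    # target <- (1 - tau) * target + tau * online, applied variable by variable.
    return [(1.0 - tau) * t + tau * o for t, o in zip(target_params, online_params)]

target = [np.zeros(3)]
online = [np.ones(3)]
target = soft_target_update(target, online)  # each entry moves 0.1% of the way toward online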
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
             nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None):
    self.sess = sess = get_session()

    with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
        # CREATE OUR TWO MODELS
        # act_model that is used for sampling
        act_model = policy(nbatch_act, 1, sess)

        # Train model for training
        if microbatch_size is None:
            train_model = policy(nbatch_train, nsteps, sess)
        else:
            train_model = policy(microbatch_size, nsteps, sess)

    # CREATE THE PLACEHOLDERS
    self.A = A = train_model.pdtype.sample_placeholder([None])
    self.ADV = ADV = tf.placeholder(tf.float32, [None])
    self.R = R = tf.placeholder(tf.float32, [None])
    # Keep track of old actor
    self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
    # Keep track of old critic
    self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
    self.LR = LR = tf.placeholder(tf.float32, [])
    # Cliprange
    self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)

    # Calculate the entropy
    # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
    entropy = tf.reduce_mean(train_model.pd.entropy())

    # CALCULATE THE LOSS
    # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

    # Clip the value to reduce variability during Critic training
    # Get the predicted value
    vpred = train_model.vf
    vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
    # Unclipped value
    vf_losses1 = tf.square(vpred - R)
    # Clipped value
    vf_losses2 = tf.square(vpredclipped - R)
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    # Calculate ratio (pi current policy / pi old policy)
    ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

    # Defining Loss = - J is equivalent to max J
    pg_losses = -ADV * ratio
    pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

    # Final PG loss
    pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
    approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
    clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

    # Total loss
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    # UPDATE THE PARAMETERS USING LOSS
    # 1. Get the model parameters
    params = tf.trainable_variables('ppo2_model')

    # 2. Build our trainer
    if MPI is not None:
        self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
    else:
        self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

    # 3. Calculate the gradients
    grads_and_var = self.trainer.compute_gradients(loss, params)
    grads, var = zip(*grads_and_var)

    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads_and_var = list(zip(grads, var))
    # zip aggregates each gradient with its associated parameter
    # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

    self.grads = grads
    self.var = var
    self._train_op = self.trainer.apply_gradients(grads_and_var)
    self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
    self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

    self.train_model = train_model
    self.act_model = act_model
    self.step = act_model.step
    self.value = act_model.value
    self.initial_state = act_model.initial_state

    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)

    initialize()
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
    if MPI is not None:
        sync_from_root(sess, global_variables)  # pylint: disable=E1101
class Model(tf.Module):
    """
    We use this object to:
    __init__:
        - Creates the step_model
        - Creates the train_model

    train():
        - Make the training part (feedforward and backpropagation of gradients)

    save/load():
        - Save/load the model
    """

    def __init__(self, *, ac_space, policy_network, value_network=None, ent_coef, vf_coef, max_grad_norm):
        super(Model, self).__init__(name='PPO2Model')
        self.train_model = PolicyWithValue(ac_space, policy_network, value_network, estimate_q=False)
        if MPI is not None:
            self.optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.train_model.trainable_variables)
        else:
            self.optimizer = tf.keras.optimizers.Adam()
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.mode = self.train_model.mode
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        if MPI is not None:
            sync_from_root(self.variables)

    def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpac_old, states=None):
        grads, pg_loss, vf_loss, entropy, approxkl, clipfrac = self.get_grad(
            cliprange, obs, returns, masks, actions, values, neglogpac_old)
        if MPI is not None:
            self.optimizer.apply_gradients(grads, lr)
        else:
            self.optimizer.learning_rate = lr
            grads_and_vars = zip(grads, self.train_model.trainable_variables)
            self.optimizer.apply_gradients(grads_and_vars)
        return pg_loss, vf_loss, entropy, approxkl, clipfrac

    @tf.function
    def get_grad(self, cliprange, obs, returns, masks, actions, values, neglogpac_old):
        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - tf.reduce_mean(advs)) / (tf.keras.backend.std(advs) + 1e-8)

        with tf.GradientTape() as tape:
            policy_latent = self.train_model.policy_network(obs)
            pd, _ = self.train_model.pdtype.pdfromlatent(policy_latent)
            neglogpac = pd.neglogp(actions)
            entropy = tf.reduce_mean(pd.entropy())
            vpred = self.train_model.value(obs)
            vpredclipped = values + tf.clip_by_value(vpred - values, -cliprange, cliprange)
            vf_losses1 = tf.square(vpred - returns)
            vf_losses2 = tf.square(vpredclipped - returns)
            vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
            ratio = tf.exp(neglogpac_old - neglogpac)
            pg_losses1 = -advs * ratio
            pg_losses2 = -advs * tf.clip_by_value(ratio, 1 - cliprange, 1 + cliprange)
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2))
            approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - neglogpac_old))
            clipfrac = tf.reduce_mean(tf.cast(tf.greater(tf.abs(ratio - 1.0), cliprange), tf.float32))
            loss = pg_loss - entropy * self.ent_coef + vf_loss * self.vf_coef

        var_list = self.train_model.trainable_variables
        grads = tape.gradient(loss, var_list)
        if self.max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
        if MPI is not None:
            grads = tf.concat([tf.reshape(g, (-1,)) for g in grads], axis=0)
        return grads, pg_loss, vf_loss, entropy, approxkl, clipfrac
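The approxkl and clipfrac quantities computed inside get_grad are diagnostics rather than part of the gradient. A minimal NumPy sketch of both, assuming arrays of per-sample negative log-probabilities under the old and new policy:

import numpy as np

def ppo_diagnostics(neglogpac_old, neglogpac_new, cliprange):
    ratio = np.exp(neglogpac_old - neglogpac_new)
    # Approximate KL(old || new) as half the mean squared difference of log-probabilities.
    approxkl = 0.5 * np.mean(np.square(neglogpac_new - neglogpac_old))
    # Fraction of samples whose ratio fell outside the clipping interval.
    clipfrac = np.mean((np.abs(ratio - 1.0) > cliprange).astype(np.float32))
    return approxkl, clipfrac

kl, frac = ppo_diagnostics(np.array([1.0, 2.0]), np.array([1.1, 1.5]), cliprange=0.2)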
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
             nsteps, ent_coef, vf_coef, max_grad_norm):
    sess = get_session()

    with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
        act_model = policy(nbatch_act, 1, sess)
        train_model = policy(nbatch_train, nsteps, sess)

    A = train_model.pdtype.sample_placeholder([None])
    ADV = tf.placeholder(tf.float32, [None])
    R = tf.placeholder(tf.float32, [None])
    OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
    OLDVPRED = tf.placeholder(tf.float32, [None])
    LR = tf.placeholder(tf.float32, [])
    CLIPRANGE = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)
    entropy = tf.reduce_mean(train_model.pd.entropy())

    vpred = train_model.vf
    vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
    vf_losses1 = tf.square(vpred - R)
    vf_losses2 = tf.square(vpredclipped - R)
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
    pg_losses = -ADV * ratio
    pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
    pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
    approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
    clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = tf.trainable_variables('ppo2_model')
    trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
    grads_and_var = trainer.compute_gradients(loss, params)
    grads, var = zip(*grads_and_var)
    if max_grad_norm is not None:
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads_and_var = list(zip(grads, var))
    _train = trainer.apply_gradients(grads_and_var)

    def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        advs = returns - values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        td_map = {train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr,
                  CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        return sess.run([pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1]

    self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
    self.train = train
    self.train_model = train_model
    self.act_model = act_model
    self.step = act_model.step
    self.value = act_model.value
    self.initial_state = act_model.initial_state
    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)

    if MPI.COMM_WORLD.Get_rank() == 0:
        initialize()
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
    sync_from_root(sess, global_variables)  # pylint: disable=E1101
class Model(object):
    def __init__(self, *, network, env, lr=3e-4, cliprange=0.2, nsteps=128,
                 nminibatches=4, noptepochs=4, ent_coef=0.0, vf_coef=0.5,
                 max_grad_norm=0.5, gamma=0.99, lam=0.95, mpi_rank_weight=1,
                 comm=None, microbatch_size=None, load_path=None, **network_kwargs):
        """
        Parameters:
        ----------

        network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only -
            see baselines.common/models.py for the full list) specifying a standard network architecture, or a function
            that takes a tensorflow tensor as input and returns a tuple (output_tensor, extra_feed), where output_tensor
            is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a
            dictionary describing how to feed state into the network for recurrent neural nets.
            See common/models.py/lstm for more details on using recurrent nets in policies.

        env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation.
            The environments produced by gym.make can be wrapped using the baselines.common.vec_env.DummyVecEnv class.

        lr: float or function. Learning rate, constant or a schedule function [0,1] -> R+ where 1 is the beginning of
            the training and 0 is the end of the training.

        cliprange: float or function. Clipping range, constant or a schedule function [0,1] -> R+ where 1 is the
            beginning of the training and 0 is the end of the training.

        nsteps: int. Number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
            nenv is the number of environment copies simulated in parallel).

        nminibatches: int. Number of training minibatches per update. For recurrent policies, should be smaller than or
            equal to the number of environments run in parallel.

        noptepochs: int. Number of training epochs per update.

        ent_coef: float. Policy entropy coefficient in the optimization objective.

        vf_coef: float. Value function loss coefficient in the optimization objective.

        gamma: float. Discounting factor.

        lam: float. Advantage estimation discounting factor (lambda in the paper).

        log_interval: int. Number of timesteps between logging events.

        load_path: str. Path to load the model from.

        **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy
            and arguments to a particular type of network. For instance, the 'mlp' network architecture has arguments
            num_hidden and num_layers.
        """
        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        policy = build_policy(env, network, **network_kwargs)
        self.env = env

        if isinstance(lr, float):
            self.lr = constfn(lr)
        else:
            assert callable(lr)
        if isinstance(cliprange, float):
            self.cliprange = constfn(cliprange)
        else:
            assert callable(cliprange)
        self.nminibatches = nminibatches

        # if eval_env is not None:
        #     eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

        # Calculate the batch_size
        self.nenvs = self.env.num_envs
        self.nsteps = nsteps
        self.nbatch = self.nenvs * self.nsteps
        self.nbatch_train = self.nbatch // nminibatches
        self.noptepochs = noptepochs

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(self.nenvs, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(self.nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])  # action placeholder
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))  # fraction of ratios that were clipped

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')

        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip aggregates each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.def_path_pre = os.path.dirname(os.path.abspath(__file__)) + '/tmp/'

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101

        if load_path is not None:
            self.load_newest(load_path)

        # Instantiate the runner object
        self.runner = Runner(env=self.env, model=self, nsteps=nsteps, gamma=gamma, lam=lam)

    def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {
            self.train_model.X: obs,
            self.A: actions,
            self.ADV: advs,
            self.R: returns,
            self.LR: lr,
            self.CLIPRANGE: cliprange,
            self.OLDNEGLOGPAC: neglogpacs,
            self.OLDVPRED: values,
        }
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks

        return self.sess.run(self.stats_list + [self._train_op], td_map)[:-1]

    def learn(self, total_timesteps, seed=None, log_interval=10, save_interval=10):
        set_global_seeds(seed)
        total_timesteps = int(total_timesteps)

        is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)
        epinfobuf = deque(maxlen=100)
        # if eval_env is not None:
        #     eval_epinfobuf = deque(maxlen=100)

        # Start total timer
        tfirststart = time.perf_counter()

        for update in range(1, total_timesteps):
            assert self.nbatch % self.nminibatches == 0
            # Start timer
            tstart = time.perf_counter()
            frac = 1.0 - (update - 1.0) / total_timesteps
            # Calculate the learning rate
            lrnow = self.lr(frac)
            # Calculate the cliprange
            cliprangenow = self.cliprange(frac)

            if update % log_interval == 0 and is_mpi_root:
                logger.info('Stepping environment...')

            # Get minibatch
            obs, returns, masks, actions, values, neglogpacs, states, epinfos = self.runner.run()  # pylint: disable=E0632
            # if eval_env is not None:
            #     eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

            if update % log_interval == 0 and is_mpi_root:
                logger.info('Done.')

            epinfobuf.extend(epinfos)
            # if eval_env is not None:
            #     eval_epinfobuf.extend(eval_epinfos)

            # Here what we're going to do is for each minibatch calculate the loss and append it.
            mblossvals = []
            if states is None:  # nonrecurrent version
                # Index of each element of batch_size
                # Create the indices array
                inds = np.arange(self.nbatch)
                for _ in range(self.noptepochs):
                    # Randomize the indexes
                    np.random.shuffle(inds)
                    # 0 to batch_size with batch_train_size step
                    for start in range(0, self.nbatch, self.nbatch_train):
                        end = start + self.nbatch_train
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mblossvals.append(self.train(lrnow, cliprangenow, *slices))
            else:  # recurrent version
                assert self.nenvs % self.nminibatches == 0
                envsperbatch = self.nenvs // self.nminibatches
                envinds = np.arange(self.nenvs)
                flatinds = np.arange(self.nenvs * self.nsteps).reshape(self.nenvs, self.nsteps)
                for _ in range(self.noptepochs):
                    np.random.shuffle(envinds)
                    for start in range(0, self.nenvs, envsperbatch):
                        end = start + envsperbatch
                        mbenvinds = envinds[start:end]
                        mbflatinds = flatinds[mbenvinds].ravel()
                        slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mbstates = states[mbenvinds]
                        mblossvals.append(self.train(lrnow, cliprangenow, *slices, mbstates))

            # Feedforward --> get losses --> update
            lossvals = np.mean(mblossvals, axis=0)
            # End timer
            tnow = time.perf_counter()
            # Calculate the fps (frames per second)
            fps = int(self.nbatch / (tnow - tstart))

            if update % log_interval == 0 or update == 1:
                # Calculates if the value function is a good predictor of the returns (ev > 1)
                # or if it's just worse than predicting nothing (ev =< 0)
                ev = explained_variance(values, returns)
                logger.record_tabular("misc/serial_timesteps", update * self.nsteps)
                logger.record_tabular("misc/nupdates", update)
                logger.record_tabular("misc/total_timesteps", update * self.nbatch)
                logger.record_tabular("fps", fps)
                logger.record_tabular("misc/explained_variance", float(ev))
                logger.record_tabular('eprewmean', safe_mean([epinfo['r'] for epinfo in epinfobuf]))
                logger.record_tabular('eplenmean', safe_mean([epinfo['l'] for epinfo in epinfobuf]))
                # if eval_env is not None:
                #     logger.record_tabular('eval_eprewmean', safe_mean([epinfo['r'] for epinfo in eval_epinfobuf]))
                #     logger.record_tabular('eval_eplenmean', safe_mean([epinfo['l'] for epinfo in eval_epinfobuf]))
                logger.record_tabular('misc/time_elapsed', tnow - tfirststart)
                for (lossval, lossname) in zip(lossvals, self.loss_names):
                    logger.record_tabular('loss/' + lossname, lossval)
                if is_mpi_root:
                    logger.dump_tabular()

            if save_interval and (update % save_interval == 0 or update == 1) and is_mpi_root:
                file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S', time.localtime(time.time()))
                model_save_path = self.def_path_pre + file_name
                self.save(model_save_path)

        return self

    def save(self, save_path=None):
        save_variables(save_path=save_path, sess=self.sess)
        print('save model variables to', save_path)

    def load_newest(self, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)))
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[-1])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)

    def load_index(self, index, load_path=None):
        file_list = os.listdir(self.def_path_pre)
        file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)), reverse=True)
        if load_path is None:
            load_path = os.path.join(self.def_path_pre, file_list[index])
        load_variables(load_path=load_path, sess=self.sess)
        print('load_path: ', load_path)
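learn anneals both the learning rate and the clip range with the same linear fraction frac = 1 - (update - 1) / total_timesteps. A minimal sketch of that scheduling logic, assuming constfn simply wraps a constant as in the constructor above:

def constfn(val):
    return lambda _: val

def schedule(update, total_updates, lr_fn, cliprange_fn):
    # frac goes from ~1.0 at the first update toward 0.0 at the last one.
    frac = 1.0 - (update - 1.0) / total_updates
    return lr_fn(frac), cliprange_fn(frac)

lr_fn = lambda f: 3e-4 * f          # linear decay to zero
clip_fn = constfn(0.2)              # constant clip range
lrnow, cliprangenow = schedule(1, 100, lr_fn, clip_fn)   # (3e-4, 0.2)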
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
             nsteps, ent_coef, vf_coef, max_grad_norm):
    sess = get_session()

    with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
        # CREATE OUR TWO MODELS
        # act_model that is used for sampling
        act_model = policy(nbatch_act, 1, sess)
        # Train model for training
        train_model = policy(nbatch_train, nsteps, sess)

    # CREATE THE PLACEHOLDERS
    A = train_model.pdtype.sample_placeholder([None])
    ADV = tf.placeholder(tf.float32, [None])
    R = tf.placeholder(tf.float32, [None])
    # Keep track of old actor
    OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
    # Keep track of old critic
    OLDVPRED = tf.placeholder(tf.float32, [None])
    LR = tf.placeholder(tf.float32, [])
    # Cliprange
    CLIPRANGE = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)

    # Calculate the entropy
    # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
    entropy = tf.reduce_mean(train_model.pd.entropy())

    # CALCULATE THE LOSS
    # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

    # Clip the value
    # Get the predicted value
    vpred = train_model.vf
    # Clipped value = Oldvalue + clip(value - oldvalue, min = -cliprange, max = cliprange)
    vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
    # Unclipped value
    vf_losses1 = tf.square(vpred - R)
    # Clipped value
    vf_losses2 = tf.square(vpredclipped - R)
    # Value loss = 0.5 * mean[max(unclipped, clipped)]
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    # Remember we want the ratio (pi current policy / pi old policy),
    # but neglogpac returns us -log(policy).
    # So we transform it into a ratio:
    # e^(-log old - (-log new)) == e^(log new - log old) == e^(log(new / old)) == new / old
    # (since the exponential function cancels the log)
    ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

    # Remember also that we're doing gradient ascent, aka we want to MAXIMIZE the objective function,
    # which is equivalent to saying Loss = - J.
    # To make the objective function negative we can put a negation on the multiplication (pi new / pi old) * - Advantages
    pg_losses = -ADV * ratio
    # value, min [1 - e], max [1 + e]
    pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

    # Final PG loss
    # Why maximum? Because pg_loss_unclipped and pg_loss_clipped are negative, and getting the min of positive elements
    # = getting the max of negative elements
    pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
    approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
    clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

    # Total loss (remember that L = - J because minimizing L is the same thing as maximizing J)
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    # UPDATE THE PARAMETERS USING LOSS
    # 1. Get the model parameters
    params = tf.trainable_variables('ppo2_model')

    # 2. Build our trainer
    trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)

    # 3. Calculate the gradients
    grads_and_var = trainer.compute_gradients(loss, params)
    grads, var = zip(*grads_and_var)

    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads_and_var = list(zip(grads, var))
    # zip aggregates each gradient with its associated parameter
    # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

    # 4. Backpropagation
    _train = trainer.apply_gradients(grads_and_var)

    def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        advs = returns - values

        # Normalize the advantages
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)

        td_map = {train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr,
                  CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks

        return sess.run([pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1]

    self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
    self.train = train
    self.train_model = train_model
    self.act_model = act_model
    self.step = act_model.step
    self.value = act_model.value
    self.initial_state = act_model.initial_state
    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)

    if MPI.COMM_WORLD.Get_rank() == 0:
        initialize()
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
    sync_from_root(sess, global_variables)  # pylint: disable=E1101
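To make the math in those comments concrete, here is a minimal NumPy sketch of the clipped surrogate policy loss and the clipped value loss exactly as they are assembled above (ratios recovered from negative log-probabilities, elementwise maximum of the two negated surrogates):

import numpy as np

def ppo_losses(old_neglogpac, neglogpac, advs, vpred, old_vpred, returns, cliprange):
    # ratio = pi_new / pi_old, recovered from negative log-probabilities.
    ratio = np.exp(old_neglogpac - neglogpac)
    pg_losses = -advs * ratio
    pg_losses2 = -advs * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    # max of the two negated surrogates == min of the two surrogates (pessimistic bound).
    pg_loss = np.mean(np.maximum(pg_losses, pg_losses2))

    # Value predictions are clipped around the old predictions before squaring.
    vpredclipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum(np.square(vpred - returns),
                                       np.square(vpredclipped - returns)))
    return pg_loss, vf_loss

pg, vf = ppo_losses(np.array([1.0]), np.array([0.8]), np.array([1.0]),
                    np.array([0.5]), np.array([0.4]), np.array([1.0]), 0.2)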
def __init__(self, agent, network, nsteps, rho, ent_coef, vf_coef, max_grad_norm,
             seed, load_path, **network_kwargs):
    super(AgentModel, self).__init__(name='MAPPO2Model')
    set_global_seeds(seed)

    # Get state_space and action_space
    ob_space = agent.observation_space
    ac_space = agent.action_space

    if isinstance(network, str):
        network_type = network
        policy_network_fn = get_network_builder(network_type)(**network_kwargs)
        network = policy_network_fn(ob_space.shape)

    self.train_model = PolicyWithValue(ac_space, network)
    if MPI is not None:
        self.optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.train_model.trainable_variables)
    else:
        self.optimizer = tf.keras.optimizers.Adam()

    # if isinstance(network, str):
    #     network = get_network_builder(network)(**network_kwargs)
    # policy_network = network(ob_space.shape)
    # value_network = network(ob_space.shape)
    # self.train_model = pi = PolicyWithValue(ac_space, policy_network, value_network)
    # self.pi_var_list = policy_network.trainable_variables + list(pi.pdtype.trainable_variables)
    # self.vf_var_list = value_network.trainable_variables + pi.value_fc.trainable_variables
    # if MPI is not None:
    #     self.pi_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.pi_var_list)
    #     self.vf_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.vf_var_list)
    # else:
    #     self.pi_optimizer = tf.keras.optimizers.Adam()
    #     self.vf_optimizer = tf.keras.optimizers.Adam()

    self.agent = agent
    self.nsteps = nsteps
    self.rho = rho
    self.ent_coef = ent_coef
    self.vf_coef = vf_coef
    self.max_grad_norm = max_grad_norm
    self.step = self.train_model.step
    self.value = self.train_model.value
    self.initial_state = self.train_model.initial_state
    self.loss_names = ['Lagrange_loss', 'sync_loss', 'policy_loss', 'value_loss',
                       'policy_entropy', 'approxkl', 'clipfrac']
    if MPI is not None:
        sync_from_root(self.variables)

    self.comm_matrix = agent.comm_matrix.copy()
    self.estimates = np.ones([agent.nmates, nsteps], dtype=np.float32)
    self.multipliers = np.zeros([agent.nmates, nsteps], dtype=np.float32)
    for i, comm_i in enumerate(self.comm_matrix):
        self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

    if load_path is not None:
        load_path = osp.expanduser(load_path)
        ckpt = tf.train.Checkpoint(model=self.train_model)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
        ckpt.restore(manager.latest_checkpoint)
class AgentModel(tf.Module):
    def __init__(self, agent, network, nsteps, rho, ent_coef, vf_coef, max_grad_norm,
                 seed, load_path, **network_kwargs):
        super(AgentModel, self).__init__(name='MAPPO2Model')
        set_global_seeds(seed)

        # Get state_space and action_space
        ob_space = agent.observation_space
        ac_space = agent.action_space

        if isinstance(network, str):
            network_type = network
            policy_network_fn = get_network_builder(network_type)(**network_kwargs)
            network = policy_network_fn(ob_space.shape)

        self.train_model = PolicyWithValue(ac_space, network)
        if MPI is not None:
            self.optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.train_model.trainable_variables)
        else:
            self.optimizer = tf.keras.optimizers.Adam()

        # if isinstance(network, str):
        #     network = get_network_builder(network)(**network_kwargs)
        # policy_network = network(ob_space.shape)
        # value_network = network(ob_space.shape)
        # self.train_model = pi = PolicyWithValue(ac_space, policy_network, value_network)
        # self.pi_var_list = policy_network.trainable_variables + list(pi.pdtype.trainable_variables)
        # self.vf_var_list = value_network.trainable_variables + pi.value_fc.trainable_variables
        # if MPI is not None:
        #     self.pi_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.pi_var_list)
        #     self.vf_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.vf_var_list)
        # else:
        #     self.pi_optimizer = tf.keras.optimizers.Adam()
        #     self.vf_optimizer = tf.keras.optimizers.Adam()

        self.agent = agent
        self.nsteps = nsteps
        self.rho = rho
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
        self.loss_names = ['Lagrange_loss', 'sync_loss', 'policy_loss', 'value_loss',
                           'policy_entropy', 'approxkl', 'clipfrac']
        if MPI is not None:
            sync_from_root(self.variables)

        self.comm_matrix = agent.comm_matrix.copy()
        self.estimates = np.ones([agent.nmates, nsteps], dtype=np.float32)
        self.multipliers = np.zeros([agent.nmates, nsteps], dtype=np.float32)
        for i, comm_i in enumerate(self.comm_matrix):
            self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

        if load_path is not None:
            load_path = osp.expanduser(load_path)
            ckpt = tf.train.Checkpoint(model=self.train_model)
            manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=None)
            ckpt.restore(manager.latest_checkpoint)

    def reinitial_estimates(self):
        self.estimates = np.random.normal(0, 0.1, [self.agent.nmates, self.nsteps]).astype(np.float32)
        self.multipliers = np.random.uniform(0, 1, [self.agent.nmates, self.nsteps]).astype(np.float32)
        for i, comm_i in enumerate(self.comm_matrix):
            self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

    def store_oldpi_var(self):
        pi_var_list = self.train_model.policy_network.trainable_variables + \
            list(self.train_model.pdtype.trainable_variables)
        self.oldpi_var_list = [var.numpy() for var in pi_var_list]

    def assign_new_eq_old(self):
        pi_var_list = self.train_model.policy_network.trainable_variables + \
            list(self.train_model.pdtype.trainable_variables)
        for pi_var, old_pi_var in zip(pi_var_list, self.oldpi_var_list):
            pi_var.assign(old_pi_var)

    # @tf.function
    # def get_vf_grad(self, cliprange, obs, returns, actions, values, advs, neglogpac_old):
    #     with tf.GradientTape() as tape:
    #         vpred = self.train_model.value(obs)
    #         vpredclipped = values + tf.clip_by_value(vpred - values, -cliprange, cliprange)
    #         vf_losses1 = tf.square(vpred - returns)
    #         vf_losses2 = tf.square(vpredclipped - returns)
    #         vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
vf_losses2)) # vf_grads = tape.gradient(vf_loss, self.vf_var_list) # if self.max_grad_norm is not None: # vf_grads, _ = tf.clip_by_global_norm(vf_grads, self.max_grad_norm) # if MPI is not None: # vf_grads = tf.concat([tf.reshape(g, (-1,)) for g in vf_grads], axis=0) # return vf_grads, vf_loss @tf.function def get_pi_grad(self, cliprange, nb, estimates, multipliers, obs, returns, actions, values, advs, neglogpac_old): with tf.GradientTape() as tape: policy_latent = self.train_model.policy_network(obs) pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent) neglogpac = pd.neglogp(actions) entropy = tf.reduce_mean(pd.entropy()) vpred = self.train_model.value(obs) vpredclipped = values + tf.clip_by_value(vpred - values, -cliprange, cliprange) vf_losses1 = tf.square(vpred - returns) vf_losses2 = tf.square(vpredclipped - returns) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(neglogpac_old - neglogpac) clipped_ratio = tf.clip_by_value(ratio, 1 - cliprange, 1 + cliprange) pg_losses1 = -advs * ratio pg_losses2 = -advs * clipped_ratio pg_loss = tf.reduce_mean(tf.maximum(pg_losses1, pg_losses2)) comm = self.comm_matrix[self.comm_matrix[:, nb] != 0][0, self.agent.id] syncerr = comm * ratio - estimates sync_loss = tf.reduce_mean(multipliers * syncerr) + \ 0.5 * self.rho * (tf.reduce_mean(tf.square(syncerr))) approxkl = .5 * tf.reduce_mean( tf.square(neglogpac - neglogpac_old)) clipfrac = tf.reduce_mean( tf.cast(tf.greater(tf.abs(ratio - 1.0), cliprange), tf.float32)) loss = pg_loss + sync_loss - entropy * self.ent_coef + vf_loss * self.vf_coef var_list = self.train_model.trainable_variables grads = tape.gradient(loss, var_list) if self.max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm) if MPI is not None: grads = tf.concat([tf.reshape(g, (-1, )) for g in grads], axis=0) return grads, loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac # pi_grads = tape.gradient(pi_loss, self.pi_var_list) # if self.max_grad_norm is not None: # pi_grads, _ = tf.clip_by_global_norm(pi_grads, self.max_grad_norm) # if MPI is not None: # pi_grads = tf.concat([tf.reshape(g, (-1,)) for g in pi_grads], axis=0) # return pi_grads, pi_loss, pg_loss, sync_loss, entropy, approxkl, clipfrac def pi_update(self, lr, cliprange, nb, obs, returns, actions, values, advs, neglogpacs_old): estimates = self.estimates[nb] multipliers = self.multipliers[nb] pi_grads, pi_loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac = self.get_pi_grad( cliprange, nb, estimates, multipliers, obs, returns, actions, values, advs, neglogpacs_old) if MPI is not None: self.optimizer.apply_gradients(pi_grads, lr) else: self.optimizer.learning_rate = lr grads_and_vars = zip(pi_grads, self.train_model.trainable_variables) self.optimizer.apply_gradients(grads_and_vars) return pi_loss, pg_loss, sync_loss, vf_loss, entropy, approxkl, clipfrac # if MPI is not None: # self.pi_optimizer.apply_gradients(pi_grads, lr) # else: # self.pi_optimizer.learning_rate = lr # grads_and_vars = zip(pi_grads, self.pi_var_list) # self.pi_optimizer.apply_gradients(grads_and_vars) # return pi_loss, pg_loss, sync_loss, entropy, approxkl, clipfrac # def vf_update(self, lr, cliprange, obs, returns, actions, values, advs, neglogpacs_old): # vf_grads, vf_loss = self.get_vf_grad( # cliprange, obs, returns, actions, values, advs, neglogpacs_old) # if MPI is not None: # self.vf_optimizer.apply_gradients(vf_grads, lr) # else: # self.vf_optimizer.learning_rate = lr # grads_and_vars = 
zip(vf_grads, self.train_model.trainable_variables) # self.vf_optimizer.apply_gradients(grads_and_vars) # return vf_loss def info_to_exchange(self, cliprange, ob, ac, neglogpac_old, nb): policy_latent = self.train_model.policy_network(ob) pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent) neglogpac = pd.neglogp(ac) ratio = tf.exp(neglogpac_old - neglogpac) clipped_ratio = tf.clip_by_value(tf.exp(-neglogpac), 1 - cliprange, 1 + cliprange) return ratio, self.multipliers[nb] def exchange(self, cliprange, ob, ac, neglogpac_old, nb_ratio, nb_multipliers, nb): policy_latent = self.train_model.policy_network(ob) pd, logits = self.train_model.pdtype.pdfromlatent(policy_latent) neglogpac = pd.neglogp(ac) ratio = tf.exp(neglogpac_old - neglogpac) clipped_ratio = tf.clip_by_value(ratio, 1 - cliprange, 1 + cliprange) comm = self.comm_matrix[self.comm_matrix[:, nb] != 0][0, self.agent.id] v = 0.5 * (self.multipliers[nb] + nb_multipliers) + \ 0.5 * self.rho * (comm * ratio + (-comm) * nb_ratio) estimate = np.array((1.0 / self.rho) * (self.multipliers[nb] - v) + comm * ratio) self.estimates = tf.tensor_scatter_nd_update(self.estimates, [[nb]], estimate[None, :]) self.multipliers = tf.tensor_scatter_nd_update(self.multipliers, [[nb]], v[None, :])
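The info_to_exchange/exchange pair above performs an ADMM-style consensus step on the per-timestep importance ratios of two neighbouring agents. A minimal NumPy sketch of the same arithmetic, assuming a single neighbour whose communication coefficient has the opposite sign (the function and variable names are illustrative, not from the source):

import numpy as np

def admm_exchange(ratio_i, ratio_j, lam_i, lam_j, rho, comm_i):
    # Mirrors AgentModel.exchange: average the two multiplier vectors, pull them
    # toward agreement on the signed ratios, then refresh the local estimate.
    comm_j = -comm_i
    v = 0.5 * (lam_i + lam_j) + 0.5 * rho * (comm_i * ratio_i + comm_j * ratio_j)
    estimate_i = (1.0 / rho) * (lam_i - v) + comm_i * ratio_i
    return v, estimate_i  # new multipliers[nb], new estimates[nb]

# toy exchange over nsteps = 4
ratio_i = np.array([1.0, 1.1, 0.9, 1.0], dtype=np.float32)
ratio_j = np.array([1.0, 0.8, 1.2, 1.0], dtype=np.float32)
lam = np.zeros(4, dtype=np.float32)
new_multiplier, new_estimate = admm_exchange(ratio_i, ratio_j, lam, lam, rho=1.0, comm_i=1.0)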
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None, l1regpi, l2regpi, l1regvf, l2regvf, wclippi, wclipvf, todropoutpi, dropoutpi_keep_prob, dropoutpi_keep_prob_value, todropoutvf, dropoutvf_keep_prob, dropoutvf_keep_prob_value, isbnpitrainmode, isbnvftrainmode): self.sess = sess = get_session() #REGULARIZATION self.toregularizepi = l1regpi > 0 or l2regpi > 0 self.toregularizevf = l1regvf > 0 or l2regvf > 0 self.todropoutpi = todropoutpi self.todropoutvf = todropoutvf self.dropoutpi_keep_prob = dropoutpi_keep_prob #TENSOR self.dropoutpi_keep_prob_value = dropoutpi_keep_prob_value self.dropoutvf_keep_prob = dropoutvf_keep_prob self.dropoutvf_keep_prob_value = dropoutvf_keep_prob_value self.isbnpitrainmode = isbnpitrainmode self.isbnvftrainmode = isbnvftrainmode self.toweightclippi = wclippi > 0 self.toweightclipvf = wclipvf > 0 with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. 
entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef if self.toregularizepi: print("regularizing policy network: L1 = {}, L2 = {}".format( l1regpi, l2regpi)) regularizerpi = tf.contrib.layers.l1_l2_regularizer( scale_l1=l1regpi, scale_l2=l2regpi, scope='ppo2_model/pi') all_trainable_weights_pi = tf.trainable_variables('ppo2_model/pi') regularization_penalty_pi = tf.contrib.layers.apply_regularization( regularizerpi, all_trainable_weights_pi) loss = loss + regularization_penalty_pi if self.toregularizevf: print("regularizing value network: L1 = {}, L2 = {}".format( l1regvf, l2regvf)) regularizervf = tf.contrib.layers.l1_l2_regularizer( scale_l1=l1regvf, scale_l2=l2regvf, scope='ppo2_model/vf') all_trainable_weights_vf = tf.trainable_variables('ppo2_model/vf') regularization_penalty_vf = tf.contrib.layers.apply_regularization( regularizervf, all_trainable_weights_vf) loss = loss + regularization_penalty_vf # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients #self._update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS) #with tf.control_dependencies(self._update_op): grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) if self.toweightclippi: print("clipping policy network = {}".format(wclippi)) policyparams = tf.trainable_variables('ppo2_model/pi') self._wclip_ops_pi = [] for toclipvar in policyparams: if 'logstd' in toclipvar.name: continue self._wclip_ops_pi.append( tf.assign(toclipvar, tf.clip_by_value(toclipvar, -wclippi, wclippi))) self._wclip_op_pi = tf.group(*self._wclip_ops_pi) if self.toweightclipvf: print("clipping value network = {}".format(wclipvf)) valueparams = tf.trainable_variables('ppo2_model/vf') self._wclip_ops_vf = [] for toclipvar in valueparams: self._wclip_ops_vf.append( tf.assign(toclipvar, tf.clip_by_value(toclipvar, -wclipvf, wclipvf))) self._wclip_op_vf = tf.group(*self._wclip_ops_vf) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] if self.toregularizepi: self.loss_names.append('regularization_pi') self.stats_list.append(regularization_penalty_pi) if self.toregularizevf: self.loss_names.append('regularization_vf') self.stats_list.append(regularization_penalty_vf) self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
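The regularization and weight-clipping branches above add an L1/L2 penalty to the loss and project the policy/value weights back into a fixed interval after each update. A NumPy sketch of both operations, assuming the parameters are handed over as a plain list of arrays (names are illustrative):

import numpy as np

def l1_l2_penalty(params, l1=0.0, l2=0.0):
    # Same quantity tf.contrib.layers.l1_l2_regularizer adds to the loss:
    # l1 * sum(|w|) + l2 * l2_loss(w), where tf.nn.l2_loss includes a 1/2 factor.
    return sum(l1 * np.abs(w).sum() + 0.5 * l2 * np.square(w).sum() for w in params)

def clip_weights(params, wclip):
    # Same effect as the tf.assign(tf.clip_by_value(...)) ops grouped into _wclip_op_*.
    return [np.clip(w, -wclip, wclip) for w in params]

weights = [np.random.randn(4, 4), np.random.randn(4)]
penalty = l1_l2_penalty(weights, l1=1e-4, l2=1e-4)
clipped = clip_weights(weights, wclip=0.5)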
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm): sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): act_model = policy(nbatch_act, 1, sess) train_model = policy(nbatch_train, nsteps, sess) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) OLDVPRED = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = tf.trainable_variables('ppo2_model') trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) grads_and_var = trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) _train = trainer.apply_gradients(grads_and_var) def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): advs = returns - values advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: lr, CLIPRANGE: cliprange, OLDNEGLOGPAC: neglogpacs, OLDVPRED: values } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks return sess.run( [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1] self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.train = train self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state # def save(file_name): # save_path = "/media/rustam/88E4BD3EE4BD2EF6/thesis/modeling/python/training/ppo_model_backups/" # ps = sess.run(params) # joblib.dump(ps, save_path+file_name) # print("\n------------\nModel with name '{}' saved successfully!\n------------\n".format(file_name)) # # def load(path_to_file): # load_path = "/media/rustam/88E4BD3EE4BD2EF6/thesis/modeling/python/training/ppo_model_backups/promising_ones/" # file_name = LOAD_FILENAME # if path_to_file is None: # path_to_file = load_path + file_name # loaded_params = joblib.load(path_to_file) # restores = [] # for p, loaded_p in zip(params, loaded_params): # restores.append(p.assign(loaded_p)) # sess.run(restores) # print("Model with name '{}' was successfully loaded!".format(file_name)) # was uncommented self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) # uncommented # self.save = save # functools.partial(save_variables, sess=sess) # self.load = load # functools.partial(load_variables, sess=sess) if 
MPI.COMM_WORLD.Get_rank() == 0: initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") sync_from_root(sess, global_variables) #pylint: disable=E1101
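For one minibatch, the losses assembled in the graph above reduce to the following NumPy arithmetic; a sketch that assumes the advantages have already been normalized:

import numpy as np

def ppo_losses(neglogpac, old_neglogpac, advs, vpred, old_vpred, returns, cliprange):
    # ratio = pi_new(a|s) / pi_old(a|s), recovered from the negative log-probs
    ratio = np.exp(old_neglogpac - neglogpac)
    pg_loss = np.mean(np.maximum(-advs * ratio,
                                 -advs * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)))
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum(np.square(vpred - returns),
                                       np.square(vpred_clipped - returns)))
    approxkl = 0.5 * np.mean(np.square(neglogpac - old_neglogpac))
    clipfrac = np.mean(np.abs(ratio - 1.0) > cliprange)
    return pg_loss, vf_loss, approxkl, clipfrac

stats = ppo_losses(neglogpac=np.array([0.9, 1.1]), old_neglogpac=np.array([1.0, 1.0]),
                   advs=np.array([0.5, -0.5]), vpred=np.array([1.0, 0.2]),
                   old_vpred=np.array([0.9, 0.3]), returns=np.array([1.2, 0.1]),
                   cliprange=0.2)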
def __init__(self, ob_space, ac_space, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, normalize_observations=True, normalize_returns=True, use_tensorboard=False, tb_log_dir=None): self.sess = sess = get_session() self.use_tensorboard = use_tensorboard if MPI is not None and comm is None: comm = MPI.COMM_WORLD # CREATE OUR TWO MODELS network_spec = [ { 'layer_type': 'dense', 'units': int (256), 'activation': 'relu', 'nodes_in': ['observation_self'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] } ] vnetwork_spec = [ { 'layer_type': 'dense', 'units': int (256), 'activation': 'relu', 'nodes_in': ['observation_self'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] }, { 'layer_type': 'dense', 'units': int (128), 'activation': 'relu', 'nodes_in': ['main'], 'nodes_out': ['main'] } ] # Act model that is used for both sampling act_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec, stochastic=True, reuse=False, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns) # Train model for training train_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec, stochastic=True, reuse=True, build_act=True, trainable_vars=None, not_trainable_vars=None, gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999, normalize_observations=normalize_observations, normalize_returns=normalize_returns) # CREATE THE PLACEHOLDERS self.A = A = {k: v.sample_placeholder([None]) for k, v in train_model.pdtypes.items()} self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = sum([train_model.pds[k].neglogp(A[k]) for k in train_model.pdtypes.keys()]) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. 
#entropy = tf.reduce_mean(train_model.entropy) entropy = tf.reduce_mean(sum([train_model.pds[k].entropy() for k in train_model.pdtypes.keys()])) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.scaled_value_tensor vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables(scope="ppo") # 2. Build our trainer if comm is not None and comm.Get_size() > 1: self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.act self.value = act_model.value self.initial_state = act_model.zero_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) if MPI is not None: sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101 if self.use_tensorboard: self.attach_tensorboard(tb_log_dir) self.tb_step = 0
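Because the policy above factorizes over a dict of action heads (one pdtype per key), the joint negative log-probability is simply the sum of the per-head terms, exactly as in the sum over train_model.pdtypes. A tiny sketch:

import numpy as np

def joint_neglogp(neglogp_per_head):
    # Independent heads multiply their probabilities, so their neglogps add.
    # neglogp_per_head: dict mapping head name -> array of shape [batch]
    return sum(neglogp_per_head.values())

total = joint_neglogp({'move': np.array([1.2, 0.7]), 'turn': np.array([0.3, 0.9])})
# -> array([1.5, 1.6])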
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, lf_coef, max_grad_norm, init_labda=1., microbatch_size=None, threshold=1.): self.sess = sess = get_session() with tf.variable_scope('ppo2_lyapunov_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.l_ADV = l_ADV = tf.placeholder(tf.float32, [None]) # 这两个R都是带衰减的R self.R = R = tf.placeholder(tf.float32, [None]) self.v_l = v_l = tf.placeholder(tf.float32, [None]) log_labda = tf.get_variable('ppo2_lyapunov_model/Labda', None, tf.float32, initializer=tf.log(init_labda)) self.labda = tf.exp(log_labda) self.safety_threshold = tf.placeholder(tf.float32, None, 'threshold') self.threshold = threshold # self.log_labda = tf.placeholder(tf.float32, None, 'Labda') # self.labda = tf.constant(10.) # self.Lam=10. # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.OLDLPRED = OLDLPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Get the predicted value lpred = train_model.lf lpredclipped = OLDLPRED + tf.clip_by_value(train_model.lf - OLDLPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value lf_losses1 = tf.square(lpred - v_l) # Clipped value lf_losses2 = tf.square(lpredclipped - v_l) lf_loss = .5 * tf.reduce_mean(tf.maximum(lf_losses1, lf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining safety loss lpred = train_model.lf lpred_ = train_model.lf_ # self.l_lambda = tf.reduce_mean(ratio * tf.stop_gradient(lpred_) - tf.stop_gradient(lpred)) l_lambda1 = tf.reduce_mean(ratio * l_ADV + v_l - self.safety_threshold) l_lambda2 = tf.reduce_mean( tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) * l_ADV + v_l - self.safety_threshold) l_lambda = tf.maximum(l_lambda1, l_lambda2) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))+ l_lambda*tf.stop_gradient(self.labda) - \ tf.stop_gradient(l_lambda) * log_labda # pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)+ self.l_lambda * self.labda) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - 
OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + lf_loss * lf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_lyapunov_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'safety_value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'lagrangian' ] self.stats_list = [ pg_loss, vf_loss, lf_loss, entropy, approxkl, clipfrac, self.labda ] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.eval_step = act_model.eval_step self.value = act_model.value self.l_value = act_model.l_value self.l_value_ = act_model.l_value_ self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
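In the Lyapunov variant above, the Lagrange multiplier is parameterized as labda = exp(log_labda), and the loss carries l_lambda * stop_gradient(labda) - stop_gradient(l_lambda) * log_labda. A sketch of what that term does to log_labda under a plain gradient step (Adam rescales the step, but the sign logic is the same):

import numpy as np

def lagrange_update(log_labda, l_lambda, lr):
    # Minimizing -stop_gradient(l_lambda) * log_labda moves log_labda by +lr * l_lambda:
    # the multiplier exp(log_labda) grows while the safety constraint is violated
    # (l_lambda > 0) and decays once it is satisfied (l_lambda < 0).
    log_labda = log_labda + lr * l_lambda
    return log_labda, np.exp(log_labda)

log_labda, labda = lagrange_update(log_labda=np.log(1.0), l_lambda=0.3, lr=1e-2)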
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, proportion_of_exp_used_for_predictor_update, microbatch_size=None): self.sess = sess = get_session() with tf.variable_scope('rnd_ppo_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # Create our RND model that will generate our intrinsic rewards rnd_model = RND(ob_space, proportion_of_exp_used_for_predictor_update) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.INT_R = INT_R = tf.placeholder(tf.float32, [None]) self.EXT_R = EXT_R = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss vf_loss_int = (0.5 * vf_coef) * tf.reduce_mean( tf.square(train_model.vf_int - self.INT_R)) vf_loss_ext = (0.5 * vf_coef) * tf.reduce_mean( tf.square(train_model.vf_ext - self.EXT_R)) vf_loss = vf_loss_int + vf_loss_ext # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss + rnd_model.rnd_loss # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('rnd_ppo_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'rnd_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.stats_list = [ pg_loss, vf_loss, rnd_model.rnd_loss, entropy, approxkl, clipfrac ] self.train_model = train_model self.act_model = act_model self.rnd_model = rnd_model self.step = act_model.step self.values = act_model.values self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
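The RND model referenced above supplies the intrinsic rewards but is not defined in this excerpt; the usual construction is a frozen, randomly initialized target network plus a trained predictor, with the prediction error used as the exploration bonus. A toy NumPy sketch under that assumption (the linear maps stand in for the real convolutional networks):

import numpy as np

rng = np.random.default_rng(0)
W_target = rng.normal(size=(16, 8))   # frozen random embedding
W_pred = rng.normal(size=(16, 8))     # would be trained to imitate the target

def intrinsic_reward(obs_batch):
    # Large where the predictor has not yet learned to match the target,
    # i.e. on rarely visited observations.
    target = obs_batch @ W_target
    pred = obs_batch @ W_pred
    return np.mean(np.square(pred - target), axis=-1)

r_int = intrinsic_reward(rng.normal(size=(4, 16)))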
class Model(object): """ We use this object to : __init__: - Creates the step_model - Creates the train_model train(): - Make the training part (feedforward and retropropagation of gradients) save/load(): - Save load the model """ def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None): self.sess = sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training #创建策略网络和值网络的时候指定batchsize构建placeholder if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac' ] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101 def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # Returns = R + yV(s') advs = returns - values # Normalize the advantages advs = (advs - advs.mean()) / (advs.std() + 1e-8) td_map = { self.train_model.X: obs, self.A: actions, self.ADV: advs, self.R: returns, self.LR: lr, self.CLIPRANGE: cliprange, self.OLDNEGLOGPAC: neglogpacs, self.OLDVPRED: values } if states is not None: td_map[self.train_model.S] = states td_map[self.train_model.M] = masks return self.sess.run(self.stats_list + [self._train_op], td_map)[:-1]
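train() above estimates the advantage as the return minus the value prediction and standardizes it per batch before filling the feed dict; the same computation in NumPy:

import numpy as np

def normalized_advantages(returns, values):
    # A(s,a) ~= R - V(s), then standardized so each minibatch has zero mean, unit std.
    advs = returns - values
    return (advs - advs.mean()) / (advs.std() + 1e-8)

advs = normalized_advantages(returns=np.array([1.0, 2.0, 0.5, 1.5]),
                             values=np.array([0.8, 1.7, 0.9, 1.2]))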
class DDPG(tf.Module): def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None, gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True, batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf), critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.): # Parameters. self.gamma = gamma self.tau = tau self.memory = memory self.normalize_observations = normalize_observations self.normalize_returns = normalize_returns self.action_noise = action_noise self.param_noise = param_noise self.action_range = action_range self.return_range = return_range self.observation_range = observation_range self.observation_shape = observation_shape self.critic = critic self.actor = actor self.clip_norm = clip_norm self.enable_popart = enable_popart self.reward_scale = reward_scale self.batch_size = batch_size self.stats_sample = None self.critic_l2_reg = critic_l2_reg self.actor_lr = tf.constant(actor_lr) self.critic_lr = tf.constant(critic_lr) # Observation normalization. if self.normalize_observations: with tf.name_scope('obs_rms'): self.obs_rms = RunningMeanStd(shape=observation_shape) else: self.obs_rms = None # Return normalization. if self.normalize_returns: with tf.name_scope('ret_rms'): self.ret_rms = RunningMeanStd() else: self.ret_rms = None # Create target networks. self.target_critic = Critic(actor.nb_actions, observation_shape, name='target_critic', network=critic.network, **critic.network_kwargs) self.target_actor = Actor(actor.nb_actions, observation_shape, name='target_actor', network=actor.network, **actor.network_kwargs) # Set up parts. if self.param_noise is not None: self.setup_param_noise() if MPI is not None: comm = MPI.COMM_WORLD self.actor_optimizer = MpiAdamOptimizer(comm, self.actor.trainable_variables) self.critic_optimizer = MpiAdamOptimizer(comm, self.critic.trainable_variables) else: self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr) self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr) logger.info('setting up actor optimizer') actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_variables] actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes]) logger.info(' actor shapes: {}'.format(actor_shapes)) logger.info(' actor params: {}'.format(actor_nb_params)) logger.info('setting up critic optimizer') critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_variables] critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes]) logger.info(' critic shapes: {}'.format(critic_shapes)) logger.info(' critic params: {}'.format(critic_nb_params)) if self.critic_l2_reg > 0.: critic_reg_vars = [] for layer in self.critic.network_builder.layers[1:]: critic_reg_vars.append(layer.kernel) for var in critic_reg_vars: logger.info(' regularizing: {}'.format(var.name)) logger.info(' applying l2 regularization with {}'.format(self.critic_l2_reg)) logger.info('setting up critic target updates ...') for var, target_var in zip(self.critic.variables, self.target_critic.variables): logger.info(' {} <- {}'.format(target_var.name, var.name)) logger.info('setting up actor target updates ...') for var, target_var in zip(self.actor.variables, self.target_actor.variables): logger.info(' {} <- {}'.format(target_var.name, var.name)) if self.param_noise: logger.info('setting up param noise') for var, perturbed_var in 
zip(self.actor.variables, self.perturbed_actor.variables): if var in actor.perturbable_vars: logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name)) else: logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) for var, perturbed_var in zip(self.actor.variables, self.perturbed_adaptive_actor.variables): if var in actor.perturbable_vars: logger.info(' {} <- {} + noise'.format(perturbed_var.name, var.name)) else: logger.info(' {} <- {}'.format(perturbed_var.name, var.name)) if self.normalize_returns and self.enable_popart: self.setup_popart() self.initial_state = None # recurrent architectures not supported yet def setup_param_noise(self): assert self.param_noise is not None # Configure perturbed actor. self.perturbed_actor = Actor(self.actor.nb_actions, self.observation_shape, name='param_noise_actor', network=self.actor.network, **self.actor.network_kwargs) # Configure separate copy for stddev adoption. self.perturbed_adaptive_actor = Actor(self.actor.nb_actions, self.observation_shape, name='adaptive_param_noise_actor', network=self.actor.network, **self.actor.network_kwargs) def setup_popart(self): # See https://arxiv.org/pdf/1602.07714.pdf for details. for vs in [self.critic.output_vars, self.target_critic.output_vars]: assert len(vs) == 2 M, b = vs assert 'kernel' in M.name assert 'bias' in b.name assert M.get_shape()[-1] == 1 assert b.get_shape()[-1] == 1 @tf.function def step(self, obs, apply_noise=True, compute_Q=True): normalized_obs = tf.clip_by_value(normalize(obs, self.obs_rms), self.observation_range[0], self.observation_range[1]) actor_tf = self.actor(normalized_obs) if self.param_noise is not None and apply_noise: action = self.perturbed_actor(normalized_obs) else: action = actor_tf if compute_Q: normalized_critic_with_actor_tf = self.critic(normalized_obs, actor_tf) q = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) else: q = None if self.action_noise is not None and apply_noise: noise = self.action_noise() action += noise action = tf.clip_by_value(action, self.action_range[0], self.action_range[1]) return action, q, None, None def store_transition(self, obs0, action, reward, obs1, terminal1): reward *= self.reward_scale B = obs0.shape[0] for b in range(B): self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b]) if self.normalize_observations: self.obs_rms.update(np.array([obs0[b]])) def train(self): batch = self.memory.sample(batch_size=self.batch_size) obs0, obs1 = tf.constant(batch['obs0']), tf.constant(batch['obs1']) actions, rewards, terminals1 = tf.constant(batch['actions']), tf.constant(batch['rewards']), tf.constant(batch['terminals1'], dtype=tf.float32) normalized_obs0, target_Q = self.compute_normalized_obs0_and_target_Q(obs0, obs1, rewards, terminals1) if self.normalize_returns and self.enable_popart: old_mean = self.ret_rms.mean old_std = self.ret_rms.std self.ret_rms.update(target_Q.flatten()) # renormalize Q outputs new_mean = self.ret_rms.mean new_std = self.ret_rms.std for vs in [self.critic.output_vars, self.target_critic.output_vars]: kernel, bias = vs kernel.assign(kernel * old_std / new_std) bias.assign((bias * old_std + old_mean - new_mean) / new_std) actor_grads, actor_loss = self.get_actor_grads(normalized_obs0) critic_grads, critic_loss = self.get_critic_grads(normalized_obs0, actions, target_Q) if MPI is not None: self.actor_optimizer.apply_gradients(actor_grads, self.actor_lr) self.critic_optimizer.apply_gradients(critic_grads, 
self.critic_lr) else: self.actor_optimizer.apply_gradients(zip(actor_grads, self.actor.trainable_variables)) self.critic_optimizer.apply_gradients(zip(critic_grads, self.critic.trainable_variables)) return critic_loss, actor_loss @tf.function def compute_normalized_obs0_and_target_Q(self, obs0, obs1, rewards, terminals1): normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_obs1 = tf.clip_by_value(normalize(obs1, self.obs_rms), self.observation_range[0], self.observation_range[1]) Q_obs1 = denormalize(self.target_critic(normalized_obs1, self.target_actor(normalized_obs1)), self.ret_rms) target_Q = rewards + (1. - terminals1) * self.gamma * Q_obs1 return normalized_obs0, target_Q @tf.function def get_actor_grads(self, normalized_obs0): with tf.GradientTape() as tape: actor_tf = self.actor(normalized_obs0) normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf) critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) actor_loss = -tf.reduce_mean(critic_with_actor_tf) actor_grads = tape.gradient(actor_loss, self.actor.trainable_variables) if self.clip_norm: actor_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in actor_grads] if MPI is not None: actor_grads = tf.concat([tf.reshape(g, (-1,)) for g in actor_grads], axis=0) return actor_grads, actor_loss @tf.function def get_critic_grads(self, normalized_obs0, actions, target_Q): with tf.GradientTape() as tape: normalized_critic_tf = self.critic(normalized_obs0, actions) normalized_critic_target_tf = tf.clip_by_value(normalize(target_Q, self.ret_rms), self.return_range[0], self.return_range[1]) critic_loss = tf.reduce_mean(tf.square(normalized_critic_tf - normalized_critic_target_tf)) # The first is input layer, which is ignored here. if self.critic_l2_reg > 0.: # Ignore the first input layer. for layer in self.critic.network_builder.layers[1:]: # The original l2_regularizer takes half of sum square. critic_loss += (self.critic_l2_reg / 2.)* tf.reduce_sum(tf.square(layer.kernel)) critic_grads = tape.gradient(critic_loss, self.critic.trainable_variables) if self.clip_norm: critic_grads = [tf.clip_by_norm(grad, clip_norm=self.clip_norm) for grad in critic_grads] if MPI is not None: critic_grads = tf.concat([tf.reshape(g, (-1,)) for g in critic_grads], axis=0) return critic_grads, critic_loss def initialize(self): if MPI is not None: sync_from_root(self.actor.trainable_variables + self.critic.trainable_variables) self.target_actor.set_weights(self.actor.get_weights()) self.target_critic.set_weights(self.critic.get_weights()) @tf.function def update_target_net(self): for var, target_var in zip(self.actor.variables, self.target_actor.variables): target_var.assign((1. - self.tau) * target_var + self.tau * var) for var, target_var in zip(self.critic.variables, self.target_critic.variables): target_var.assign((1. - self.tau) * target_var + self.tau * var) def get_stats(self): if self.stats_sample is None: # Get a sample and keep that fixed for all further computations. # This allows us to estimate the change in value for the same set of inputs. 
self.stats_sample = self.memory.sample(batch_size=self.batch_size) obs0 = self.stats_sample['obs0'] actions = self.stats_sample['actions'] normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) normalized_critic_tf = self.critic(normalized_obs0, actions) critic_tf = denormalize(tf.clip_by_value(normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms) actor_tf = self.actor(normalized_obs0) normalized_critic_with_actor_tf = self.critic(normalized_obs0, actor_tf) critic_with_actor_tf = denormalize(tf.clip_by_value(normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms) stats = {} if self.normalize_returns: stats['ret_rms_mean'] = self.ret_rms.mean stats['ret_rms_std'] = self.ret_rms.std if self.normalize_observations: stats['obs_rms_mean'] = tf.reduce_mean(self.obs_rms.mean) stats['obs_rms_std'] = tf.reduce_mean(self.obs_rms.std) stats['reference_Q_mean'] = tf.reduce_mean(critic_tf) stats['reference_Q_std'] = reduce_std(critic_tf) stats['reference_actor_Q_mean'] = tf.reduce_mean(critic_with_actor_tf) stats['reference_actor_Q_std'] = reduce_std(critic_with_actor_tf) stats['reference_action_mean'] = tf.reduce_mean(actor_tf) stats['reference_action_std'] = reduce_std(actor_tf) if self.param_noise: perturbed_actor_tf = self.perturbed_actor(normalized_obs0) stats['reference_perturbed_action_mean'] = tf.reduce_mean(perturbed_actor_tf) stats['reference_perturbed_action_std'] = reduce_std(perturbed_actor_tf) stats.update(self.param_noise.get_stats()) return stats def adapt_param_noise(self, obs0): try: from mpi4py import MPI except ImportError: MPI = None if self.param_noise is None: return 0. mean_distance = self.get_mean_distance(obs0).numpy() if MPI is not None: mean_distance = MPI.COMM_WORLD.allreduce(mean_distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size() self.param_noise.adapt(mean_distance) return mean_distance @tf.function def get_mean_distance(self, obs0): # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation. update_perturbed_actor(self.actor, self.perturbed_adaptive_actor, self.param_noise.current_stddev) normalized_obs0 = tf.clip_by_value(normalize(obs0, self.obs_rms), self.observation_range[0], self.observation_range[1]) actor_tf = self.actor(normalized_obs0) adaptive_actor_tf = self.perturbed_adaptive_actor(normalized_obs0) mean_distance = tf.sqrt(tf.reduce_mean(tf.square(actor_tf - adaptive_actor_tf))) return mean_distance def reset(self): # Reset internal state after an episode is complete. if self.action_noise is not None: self.action_noise.reset() if self.param_noise is not None: update_perturbed_actor(self.actor, self.perturbed_actor, self.param_noise.current_stddev)
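update_target_net above applies Polyak averaging with rate tau to both target networks; a sketch of that update on plain arrays:

import numpy as np

def soft_update(target_params, online_params, tau):
    # target <- (1 - tau) * target + tau * online, applied variable by variable.
    return [(1.0 - tau) * t + tau * o for t, o in zip(target_params, online_params)]

online = [np.ones((2, 2)), np.ones(2)]
target = [np.zeros((2, 2)), np.zeros(2)]
target = soft_update(target, online, tau=0.001)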
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None, mix_mode='nomix', mix_alpha=0.2, mix_beta=0.2, fix_representation=False, use_l2reg=False, l2reg_coeff=1e-4): self.sess = sess = get_session() if MPI is not None and comm is None: comm = MPI.COMM_WORLD with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess, mix_mode=mix_mode) else: train_model = policy(microbatch_size, nsteps, sess, mix_mode=mix_mode) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) # Interpolating the supervision if mix_mode == 'mixreg': # get coeff and indices coeff = train_model.coeff indices = train_model.indices other_indices = train_model.other_indices # mixup OLDNEGLOGPAC = coeff * tf.gather(OLDNEGLOGPAC, indices, axis=0) \ + (1 - coeff) * tf.gather( OLDNEGLOGPAC, other_indices, axis=0) OLDVPRED = coeff * tf.gather(OLDVPRED, indices, axis=0) \ + (1 - coeff) * tf.gather(OLDVPRED, other_indices, axis=0) R = coeff * tf.gather(R, indices, axis=0) \ + (1 - coeff) * tf.gather(R, other_indices, axis=0) ADV = coeff * tf.gather(ADV, indices, axis=0) \ + (1 - coeff) * tf.gather(ADV, other_indices, axis=0) A = tf.gather(A, indices, axis=0) elif mix_mode == 'mixobs': # get indices indices = train_model.indices # gather OLDNEGLOGPAC = tf.gather(OLDNEGLOGPAC, train_model.indices, axis=0) OLDVPRED = tf.gather(OLDVPRED, train_model.indices, axis=0) R = tf.gather(R, train_model.indices, axis=0) ADV = tf.gather(ADV, train_model.indices, axis=0) A = tf.gather(A, train_model.indices, axis=0) elif mix_mode == 'nomix': pass else: raise ValueError(f"Unknown mixing mode: {mix_mode} !") # Store the nodes to be recorded self.loss_names = [] self.stats_list = [] ############ CALCULATE LOSS ############ # Total loss = Policy gradient loss - entropy * entropy coefficient # + Value coefficient * value loss # Normalizing advantage ADV = (ADV - tf.reduce_mean(ADV)) / (reduce_std(ADV) + 1e-8) # Calculate the entropy entropy = tf.reduce_mean(train_model.pd.entropy()) # Calculate value loss vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) vf_losses1 = tf.square(vpred - R) vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate policy gradient loss neglogpac = train_model.pd.neglogp(A) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # Record some information approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) self.loss_names.extend([ 'total_loss', 
'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', ]) self.stats_list.extend([ loss, pg_loss, vf_loss, entropy, approxkl, clipfrac, ]) ############################################ ############ UPDATE THE PARAMETERS ############ # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') if use_l2reg: weight_params = [v for v in params if '/b' not in v.name] l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params]) self.loss_names.append('l2_loss') self.stats_list.append(l2_loss) loss = loss + l2_loss * l2reg_coeff if fix_representation: params = params[-4:] # 2. Build our trainer if comm is not None and comm.Get_size() > 1: self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) # 4. Clip the gradient if required if max_grad_norm is not None: grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) ############################################### self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self._init_op = tf.variables_initializer(params) self._sync_param = lambda: sync_from_root(sess, params, comm=comm) self.mix_mode = mix_mode self.mix_alpha = mix_alpha # JAG: Add beta parameter self.mix_beta = mix_beta self.fix_representation = fix_representation self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.adv_gradient = act_model.adv_gradient self.initial_state = act_model.initial_state self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") # Exclude the random convolution layer from syncing global_variables = [ v for v in global_variables if 'randcnn' not in v.name ] if MPI is not None: sync_from_root(sess, global_variables, comm=comm)
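The 'mixreg' branch above interpolates the supervision signals between each sample and a randomly paired one, using coefficients and indices produced by train_model. A NumPy sketch of that interpolation with the pairing and Beta-distributed coefficients drawn locally for illustration (the real coefficient distribution lives inside the policy and may differ):

import numpy as np

rng = np.random.default_rng(0)

def mixreg_targets(values, returns, advs, alpha=0.2):
    n = len(returns)
    other = rng.permutation(n)                 # other_indices
    coeff = rng.beta(alpha, alpha, size=n)     # mixing coefficients
    mix = lambda x: coeff * x + (1.0 - coeff) * x[other]
    return mix(values), mix(returns), mix(advs)

v, r, a = mixreg_targets(np.ones(8), np.linspace(0.0, 1.0, 8), np.zeros(8))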
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None, unsupType='action'): self.sess = sess = get_session() # icm parameters self.unsup = unsupType is not None predictor = None self.numaction = ac_space.n designHead = 'universe' with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess) else: train_model = policy(microbatch_size, nsteps, sess) if self.unsup: with tf.variable_scope("predictor", reuse=tf.AUTO_REUSE): if 'state' in unsupType: self.local_ap_network = predictor = StatePredictor( ob_space, ac_space, designHead, unsupType) else: self.local_ap_network = predictor = StateActionPredictor( ob_space, ac_space, designHead) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean( tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # computing predictor loss predloss = None if self.unsup: if 'state' in unsupType: predloss = constants[ 'PREDICTION_LR_SCALE'] * predictor.forwardloss else: predloss = constants['PREDICTION_LR_SCALE'] * ( predictor.invloss * (1 - constants['FORWARD_LOSS_WT']) + predictor.forwardloss * constants['FORWARD_LOSS_WT']) # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer if MPI is not None: self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5) else: self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. 
Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) if self.unsup: predgrads_and_var = self.trainer.compute_gradients( predloss * 20.0, predictor.var_list) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # clip predictor gradients if self.unsup: predgrads, _ = zip(*predgrads_and_var) predgrads, _ = tf.clip_by_global_norm(predgrads, constants['GRAD_NORM_CLIP']) predgrads_and_var = list(zip(predgrads, predictor.var_list)) # combine the policy and predictor grads and vars grads_and_var = grads_and_var + predgrads_and_var # unzip the grads and var after adding predictor grads/vars grads, var = zip(*grads_and_var) # normalize gradients for logging predgrad_global_norm = tf.global_norm(predgrads) # normalize gradients for logging grad_global_norm = tf.global_norm(grads) self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = [ 'policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'grad_global_norm' ] self.stats_list = [ pg_loss, vf_loss, entropy, approxkl, clipfrac, grad_global_norm ] if self.unsup: self.loss_names += [ 'predloss', 'pred_forwardloss', 'pred_invloss', 'predgrad_global_norm' ] self.stats_list += [ predloss, predictor.forwardloss, predictor.invloss, predgrad_global_norm ] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state # prediction bonus function for icm self.pred_bonus = predictor.pred_bonus self.pred_bonuses = predictor.pred_bonuses self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) initialize() global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="") if MPI is not None: sync_from_root(sess, global_variables) #pylint: disable=E1101
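The predictor loss above combines the ICM inverse and forward losses with weights read from a constants dict that is not part of this excerpt. A small sketch of that combination, with illustrative values in place of the real constants:

def icm_pred_loss(forward_loss, inverse_loss, forward_wt=0.2, lr_scale=10.0):
    # inverse model: predict the action from consecutive feature embeddings;
    # forward model: predict the next embedding from the current one + action.
    # forward_wt and lr_scale stand in for FORWARD_LOSS_WT and PREDICTION_LR_SCALE.
    return lr_scale * (inverse_loss * (1.0 - forward_wt) + forward_loss * forward_wt)

loss = icm_pred_loss(forward_loss=0.8, inverse_loss=0.4)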
def __init__(self, *, network, env, lr=3e-4, cliprange=0.2, nsteps=128, nminibatches=4,
             noptepochs=4, ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99,
             lam=0.95, mpi_rank_weight=1, comm=None, microbatch_size=None,
             load_path=None, **network_kwargs):
    """
    Parameters:
    ----------
    network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn,
        cnn_small, conv_only - see baselines.common/models.py for the full list) specifying a
        standard architecture, or a function that takes a tensorflow tensor as input and
        returns a tuple (output_tensor, extra_feed), where output_tensor is the last network
        layer output and extra_feed is None for feed-forward nets or a dictionary describing
        how to feed state into the network for recurrent nets. See common/models.py/lstm for
        details on using recurrent nets in policies.
    env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel
        environment simulation. Environments produced by gym.make can be wrapped with
        baselines.common.vec_env.DummyVecEnv.
    lr: float or function. Learning rate, constant or a schedule function [0, 1] -> R+ where
        1 is the beginning of training and 0 is the end.
    cliprange: float or function. Clipping range, constant or a schedule function
        [0, 1] -> R+ where 1 is the beginning of training and 0 is the end.
    nsteps: int. Number of steps of the vectorized environment per update (i.e. batch size is
        nsteps * nenv where nenv is the number of environment copies simulated in parallel).
    nminibatches: int. Number of training minibatches per update. For recurrent policies,
        should be smaller than or equal to the number of environments run in parallel.
    noptepochs: int. Number of training epochs per update.
    ent_coef: float. Policy entropy coefficient in the optimization objective.
    vf_coef: float. Value function loss coefficient in the optimization objective.
    gamma: float. Discount factor.
    lam: float. Advantage estimation discount factor (lambda in the paper).
    log_interval: int. Number of timesteps between logging events.
    load_path: str. Path to load the model from.
    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and the arguments to a particular type of
        network. For instance, the 'mlp' architecture has arguments num_hidden and num_layers.
    """
    self.sess = sess = get_session()

    if MPI is not None and comm is None:
        comm = MPI.COMM_WORLD

    policy = build_policy(env, network, **network_kwargs)
    self.env = env

    if isinstance(lr, float):
        self.lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        self.cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    self.nminibatches = nminibatches

    # if eval_env is not None:
    #     eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    # Calculate the batch size
    self.nenvs = self.env.num_envs
    self.nsteps = nsteps
    self.nbatch = self.nenvs * self.nsteps
    self.nbatch_train = self.nbatch // nminibatches
    self.noptepochs = noptepochs

    with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
        # CREATE OUR TWO MODELS
        # act_model that is used for sampling
        act_model = policy(self.nenvs, 1, sess)

        # Train model for training
        if microbatch_size is None:
            train_model = policy(self.nbatch_train, nsteps, sess)
        else:
            train_model = policy(microbatch_size, nsteps, sess)

    # CREATE THE PLACEHOLDERS
    self.A = A = train_model.pdtype.sample_placeholder([None])  # action placeholder
    self.ADV = ADV = tf.placeholder(tf.float32, [None])
    self.R = R = tf.placeholder(tf.float32, [None])
    # Keep track of old actor
    self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
    # Keep track of old critic
    self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
    self.LR = LR = tf.placeholder(tf.float32, [])
    # Cliprange
    self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)

    # Calculate the entropy
    # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
    entropy = tf.reduce_mean(train_model.pd.entropy())

    # CALCULATE THE LOSS
    # Total loss = policy gradient loss - entropy * entropy coefficient + value coefficient * value loss

    # Clip the value to reduce variability during critic training
    # Get the predicted value
    vpred = train_model.vf
    vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                               -CLIPRANGE, CLIPRANGE)
    # Unclipped value
    vf_losses1 = tf.square(vpred - R)
    # Clipped value
    vf_losses2 = tf.square(vpredclipped - R)
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    # Calculate ratio (pi current policy / pi old policy)
    ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

    # Defining Loss = -J is equivalent to maximizing J
    pg_losses = -ADV * ratio
    pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

    # Final PG loss
    pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
    approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
    # Fraction of samples whose ratio was clipped
    clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

    # Total loss
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    # UPDATE THE PARAMETERS USING LOSS
    # 1. Get the model parameters
    params = tf.trainable_variables('ppo2_model')

    # 2. Build our trainer
    if comm is not None and comm.Get_size() > 1:
        self.trainer = MpiAdamOptimizer(comm, learning_rate=LR,
                                        mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
    else:
        self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

    # 3. Calculate the gradients
    grads_and_var = self.trainer.compute_gradients(loss, params)
    grads, var = zip(*grads_and_var)
    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    # zip pairs each gradient with its associated parameter
    # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
    grads_and_var = list(zip(grads, var))

    self.grads = grads
    self.var = var
    self._train_op = self.trainer.apply_gradients(grads_and_var)
    self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
    self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

    self.train_model = train_model
    self.act_model = act_model
    self.step = act_model.step
    self.value = act_model.value
    self.initial_state = act_model.initial_state
    self.def_path_pre = os.path.dirname(os.path.abspath(__file__)) + '/tmp/'

    initialize()
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
    if MPI is not None:
        sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101

    if load_path is not None:
        self.load_newest(load_path)

    # Instantiate the runner object
    self.runner = Runner(env=self.env, model=self, nsteps=nsteps, gamma=gamma, lam=lam)
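# A minimal usage sketch of the constructor above. Assumptions: the enclosing class is
# referred to here as Model, and a small gym environment wrapped in DummyVecEnv stands in
# for a real vectorized env. As described in the docstring, lr and cliprange may also be
# schedule functions of the remaining-training fraction frac in [0, 1]; linear annealing
# is shown as one common choice.
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

def make_annealed_model():
    env = DummyVecEnv([lambda: gym.make('CartPole-v1')])
    return Model(network='mlp', env=env,
                 lr=lambda frac: 3e-4 * frac,          # anneal learning rate toward 0
                 cliprange=lambda frac: 0.2 * frac,    # anneal the clip range as well
                 nsteps=128, nminibatches=4, noptepochs=4,
                 ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5)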
class ICM(object):
    def __init__(self, ob_space, ac_space, max_grad_norm, beta, icm_lr_scale, idf):
        sess = get_session()

        # TODO find a better way
        input_shape = [ob_space.shape[0], ob_space.shape[1], ob_space.shape[2]]
        # input_shape = ob_space
        print("ICM state input shape ", np.shape(input_shape), " ", input_shape)
        self.action_shape = 36
        self.idf = idf

        # Placeholders
        self.state_ = phi_state = tf.placeholder(tf.float32, [None, *input_shape],
                                                 name="icm_state")
        self.next_state_ = phi_next_state = tf.placeholder(tf.float32, [None, *input_shape],
                                                           name="icm_next_state")
        self.action_ = action = tf.placeholder(tf.float32, [None], name="icm_action")
        # self.R = rewards = tf.placeholder(tf.float32, shape=[None], name="maxR")

        with tf.variable_scope('icm_model'):
            # Feature encoding
            # i.e. pass state and next_state to create phi(state) and phi(next_state)
            # state --> phi(state)
            print("Feature encoding of phi state with shape :: ", self.state_)
            phi_state = self.feature_encoding(self.state_)

            with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
                # next_state --> phi(next_state)
                phi_next_state = self.feature_encoding(self.next_state_)

            # INVERSE MODEL
            if self.idf:
                pred_actions_logits, pred_actions_prob = self.inverse_model(
                    phi_state, phi_next_state)

            # FORWARD MODEL
            pred_phi_next_state = self.forward_model(action, phi_state)

        # CALCULATE THE ICM LOSS
        # Inverse loss LI
        # Cross entropy between the predicted action ât and the true action at
        # Cast the labels to integers (required)
        labels = tf.cast(action, tf.int32)
        if self.idf:
            self.inv_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=pred_actions_logits, labels=labels),
                name="inverse_loss")

        # Forward loss
        # LF = 1/2 || pred_phi_next_state - phi_next_state ||
        # TODO 0.5 * ?
        self.forw_loss_axis = tf.reduce_mean(
            tf.square(tf.subtract(pred_phi_next_state, phi_next_state)),
            axis=-1, name="forward_loss_axis")
        self.forw_loss = tf.reduce_mean(
            tf.square(tf.subtract(pred_phi_next_state, phi_next_state)),
            name="forward_loss")

        # TODO predictor lr scale?
        # ICM_LOSS = [(1 - beta) * LI + beta * LF] * Predictor_Lr_scale
        if self.idf:
            self.icm_loss = (1 - beta) * self.inv_loss + beta * self.forw_loss  # * icm_lr_scale
        else:
            self.icm_loss = self.forw_loss

        # self.icm_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
        #                                       tf.get_variable_scope().name)

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        self.icm_params = tf.trainable_variables('icm_model')

        ## testing phase
        self.predgrads = tf.gradients(self.icm_loss, self.icm_params)
        self.predgrads, _ = tf.clip_by_global_norm(self.predgrads, max_grad_norm)
        self.pred_grads_and_vars = list(zip(self.predgrads, self.icm_params))
        ## testing phase

        # 2. Build our trainer
        self.icm_trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=1e-3, epsilon=1e-5)

        # 3. Calculate the gradients
        icm_grads_and_var = self.icm_trainer.compute_gradients(self.icm_loss, self.icm_params)
        icm_grads, icm_var = zip(*icm_grads_and_var)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            icm_grads, icm_grad_norm = tf.clip_by_global_norm(icm_grads, max_grad_norm)
            # zip pairs each gradient with its associated parameter
            # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
            icm_grads_and_var = list(zip(icm_grads, icm_var))

        self._icm_train = self.icm_trainer.apply_gradients(icm_grads_and_var)

        if MPI.COMM_WORLD.Get_rank() == 0:
            print("Initialize")
        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
        sync_from_root(sess, global_variables)  # pylint: disable=E1101

    # We use batch normalization to do feature normalization as explained in the paper,
    # using the universe head.
    def feature_encoding(self, x):
        x = tf.nn.elu(tf.layers.batch_normalization(conv2d(x, 8, 5, 4, "valid")))
        x = tf.nn.elu(tf.layers.batch_normalization(conv2d(x, 16, 3, 2, "valid")))
        x = tf.nn.elu(tf.layers.batch_normalization(conv2d(x, 32, 3, 2, "valid")))
        x = tf.nn.elu(tf.layers.batch_normalization(conv2d(x, 64, 3, 2, "valid")))
        x = tf.layers.flatten(x)
        x = tf.nn.elu(tf.contrib.layers.fully_connected(x, 256))
        return x

    # Inverse model: given phi(state) and phi(next_state), predict the action ât
    def inverse_model(self, phi_state, phi_next_state):
        """
        Parameters
        __________
        phi_state: feature representation of state produced by feature_encoding.
        phi_next_state: feature representation of next_state produced by feature_encoding.

        returns
            pred_actions_logits: the action logits
            pred_actions_prob: the probability distribution over actions
        """
        # Concatenate phi(st) and phi(st+1)
        icm_inv_concatenate = tf.concat([phi_state, phi_next_state], 1)
        icm_inv_fc1 = tf.nn.relu(tf.layers.dense(icm_inv_concatenate, 256))
        pred_actions_logits = tf.layers.dense(icm_inv_fc1, self.action_shape)
        pred_actions_prob = tf.nn.softmax(pred_actions_logits, dim=-1)
        return pred_actions_logits, pred_actions_prob

    # Forward model: given the action and phi(st), predict pred_phi(st+1)
    def forward_model(self, action, phi_state):
        """
        Parameters
        __________
        action: the action taken by our agent.
        phi_state: feature representation of state produced by feature_encoding.

        returns
            pred_phi_next_state: the predicted feature representation of next_state.
        """
        # Expand dimension to be able to concatenate phi_state and action
        action = tf.expand_dims(action, axis=1)
        icm_forw_concatenate = tf.concat(axis=1, values=[phi_state, action])
        # FC
        icm_forw_fc1 = tf.layers.dense(icm_forw_concatenate, 256)
        # FC with the width of phi_state (i.e. 288)
        icm_forw_pred_next_state = tf.layers.dense(icm_forw_fc1,
                                                   phi_state.get_shape()[1].value)
        return icm_forw_pred_next_state

    # Calculate the intrinsic reward
    def calculate_intrinsic_reward(self, state, next_state, action):
        """
        Parameters
        __________
        state, next_state: observation batches fed through feature_encoding.
        action: the actions taken by our agent.

        returns
            intrinsic_reward: per-sample forward-model error used as the intrinsic reward.
        """
        sess = tf.get_default_session()
        # Per-sample forward loss (reduced over the feature axis only)
        error = sess.run(self.forw_loss_axis, {
            self.state_: state,
            self.next_state_: next_state,
            self.action_: action
        })
        # error = error * 0.5
        return error

    def train_curiosity_model(self, states, next_states, actions):  # , rewards):
        sess = tf.get_default_session()
        feed = {
            self.state_: states,
            self.next_state_: next_states,
            self.action_: actions
        }  # , self.R: rewards}
        if self.idf:
            return sess.run((self.forw_loss, self.inv_loss, self.icm_loss, self._icm_train),
                            feed_dict=feed)
        else:
            return sess.run((self.forw_loss, self.icm_loss, self._icm_train), feed_dict=feed)
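# A minimal usage sketch of the ICM class above (the helper name icm_step and the mixing
# coefficient intrinsic_coef are hypothetical): compute per-sample intrinsic rewards for a
# rollout batch, mix them into the extrinsic rewards, and take one curiosity-model update.
import numpy as np

def icm_step(icm, states, next_states, actions, extrinsic_rewards, intrinsic_coef=0.01):
    # Forward-model prediction error acts as the curiosity bonus
    intrinsic = icm.calculate_intrinsic_reward(states, next_states, actions)
    mixed_rewards = np.asarray(extrinsic_rewards) + intrinsic_coef * intrinsic
    # One gradient step on the ICM forward (and, if idf, inverse) losses
    losses = icm.train_curiosity_model(states, next_states, actions)
    return mixed_rewards, losses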
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps,
             ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None,
             microbatch_size=None, model_index=0):
    self.sess = sess = get_session()
    self.model_index = model_index

    if MPI is not None and comm is None:
        comm = MPI.COMM_WORLD

    with tf.variable_scope('ppo2_model%s' % model_index, reuse=tf.AUTO_REUSE):
        # CREATE OUR TWO MODELS
        # act_model that is used for sampling
        act_model = policy(nbatch_act, 1, sess)

        # Train model for training
        if microbatch_size is None:
            train_model = policy(nbatch_train, nsteps, sess)
        else:
            train_model = policy(microbatch_size, nsteps, sess)

    # CREATE THE PLACEHOLDERS
    self.A = A = train_model.pdtype.sample_placeholder([None])
    self.ADV = ADV = tf.placeholder(tf.float32, [None])
    self.R = R = tf.placeholder(tf.float32, [None])
    # Keep track of old actor
    self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
    # Keep track of old critic
    self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
    self.LR = LR = tf.placeholder(tf.float32, [])
    # Cliprange
    self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)

    # Calculate the entropy
    # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
    entropy = tf.reduce_mean(train_model.pd.entropy())

    # CALCULATE THE LOSS
    # Total loss = policy gradient loss - entropy * entropy coefficient + value coefficient * value loss

    # Clip the value to reduce variability during critic training
    # Get the predicted value
    vpred = train_model.vf
    vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                               -CLIPRANGE, CLIPRANGE)
    # Unclipped value
    vf_losses1 = tf.square(vpred - R)
    # Clipped value
    vf_losses2 = tf.square(vpredclipped - R)
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

    # Calculate ratio (pi current policy / pi old policy)
    ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

    # Defining Loss = -J is equivalent to maximizing J
    pg_losses = -ADV * ratio
    pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

    # Final PG loss
    pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
    approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
    clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

    # Total loss
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    # UPDATE THE PARAMETERS USING LOSS
    # 1. Get the model parameters
    params = tf.trainable_variables('ppo2_model%s' % model_index)
    # print("para", model_index, params)

    # 2. Build our trainer
    if comm is not None and comm.Get_size() > 1:
        self.trainer = MpiAdamOptimizer(comm, learning_rate=LR,
                                        mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
    else:
        self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

    # 3. Calculate the gradients
    grads_and_var = self.trainer.compute_gradients(loss, params)
    grads, var = zip(*grads_and_var)
    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    # zip aggregates each gradient with its associated parameter
    # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
    grads_and_var = list(zip(grads, var))

    self.grads = grads
    self.var = var
    self._train_op = self.trainer.apply_gradients(grads_and_var)
    self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
    self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

    self.train_model = train_model
    self.act_model = act_model
    self.step = act_model.step
    self.value = act_model.value
    self.initial_state = act_model.initial_state
    self.save = functools.partial(save_trainable_variables,
                                  scope="ppo2_model%s" % model_index, sess=sess)
    self.load = functools.partial(load_trainable_variables,
                                  scope="ppo2_model%s" % model_index, sess=sess)

    initialize()
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
    # print("global_variables", model_index, global_variables)
    if MPI is not None:
        sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101
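# A minimal sketch of why model_index exists (assumptions: the enclosing class is called
# Model, and `policy`, the spaces, and the batch sizes have already been built as above;
# the helper name build_population is hypothetical). Because each instance places its
# variables under 'ppo2_model<i>' and saves/loads only that scope, several independent
# policies can coexist in one graph, e.g. for self-play or population-style training.
def build_population(policy, ob_space, ac_space, nenvs, nbatch_train, nsteps, n_models=2):
    return [Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                  nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,
                  ent_coef=0.0, vf_coef=0.5, max_grad_norm=0.5, model_index=i)
            for i in range(n_models)]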