def create_surr(self):
    # Probabilities of the actions actually taken, under the new and old policies.
    p_n = utils.slice_2d(self.policy.pi_theta, tf.range(0, self.N), self.action)
    p_n_old = utils.slice_2d(self.policy.pi_theta_old, tf.range(0, self.N),
                             self.action)
    # Surrogate loss: negated importance-weighted advantage (minimized).
    self.surr_loss = -tf.reduce_mean(p_n / p_n_old * self.advantage)
    self.surr_loss_grad = utils.flatgrad(self.surr_loss, self.var_list)
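# Note: utils.slice_2d and utils.flatgrad come from the project's utils module,
# which is not shown in this section. The two functions below are a minimal
# sketch (names suffixed _sketch to mark them as assumptions, not the project's
# actual code) of the behaviour the call sites above rely on: picking
# pi(a_i | s_i) out of each row of the policy output, and concatenating
# per-variable gradients into one flat vector (graph-mode TF1).
import tensorflow as tf

def slice_2d_sketch(x, inds0, inds1):
    # Gather x[inds0[i], inds1[i]] for every i, i.e. the probability of the
    # action actually taken in each state.
    ncols = tf.shape(x)[1]
    flat_indices = inds0 * ncols + inds1
    return tf.gather(tf.reshape(x, [-1]), flat_indices)

def flatgrad_graph_sketch(loss, var_list):
    # Gradients of `loss` w.r.t. every variable, flattened and concatenated.
    grads = tf.gradients(loss, var_list)
    return tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)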
def create_functions(self):
    eps = config.EPS
    self.var_list = tf.trainable_variables()
    self.create_surr()

    # KL divergence D_KL(pi_theta || pi_theta_old) and policy entropy,
    # both averaged over the batch (self.Nf is the batch size as a float).
    # Alternative (reverse) KL direction, kept for reference:
    # self.KL = (tf.reduce_sum(self.policy.pi_theta_old *
    #                          tf.log((self.policy.pi_theta_old + eps) /
    #                                 (self.policy.pi_theta + eps))) / self.Nf)
    self.KL = tf.reduce_sum(self.policy.pi_theta * tf.log(
        (self.policy.pi_theta + eps) /
        (self.policy.pi_theta_old + eps))) / self.Nf
    self.entropy = tf.reduce_sum(
        -self.policy.pi_theta * tf.log(self.policy.pi_theta + eps)) / self.Nf

    # KL of the current policy against a stop-gradient copy of itself; its
    # Hessian w.r.t. the parameters is the Fisher information matrix.
    # Alternative formulation with the first argument held fixed:
    # self.KL_firstfixed = tf.reduce_sum(
    #     tf.stop_gradient(self.policy.pi_theta) *
    #     tf.log(tf.stop_gradient(self.policy.pi_theta + eps) /
    #            (self.policy.pi_theta + eps))) / self.Nf
    self.KL_firstfixed = tf.reduce_sum(self.policy.pi_theta * tf.log(
        (self.policy.pi_theta + eps) /
        tf.stop_gradient(self.policy.pi_theta + eps))) / self.Nf
    self.KL_firstfixed_grad = tf.gradients(self.KL_firstfixed, self.var_list)

    # Reshape the flat tangent vector into per-variable tensors.
    shapes = map(utils.var_shape, self.var_list)
    start = 0
    self.tangents = []
    for shape in shapes:
        size = np.prod(shape)
        param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
        self.tangents.append(param)
        start += size

    # Fisher-vector product: gradient of (grad KL . tangent) w.r.t. the parameters.
    self.fisher_vect_prod = utils.flatgrad([
        tf.reduce_sum(g * t)
        for (g, t) in zip(self.KL_firstfixed_grad, self.tangents)
    ], self.var_list)

    self.current_theta = utils.GetFlat(self.session, self.var_list)
    self.set_theta = utils.SetFromFlat(self.session, self.var_list)
    self.value_func = utils.ValueFunction(self.session)
    self.stats = []
    self.saver = tf.train.Saver()
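# The ops built above are only graph nodes; the update loop that consumes them
# is not part of this section. As a hedged illustration (the function name and
# the `damping` argument are assumptions, not part of the original code), the
# Fisher-vector product needed by conjugate gradient would be evaluated by
# feeding a candidate direction p into self.flat_tangent alongside the rollout
# placeholders:
def fisher_vector_product_sketch(agent, p, rollout_feed, damping=1e-3):
    feed = dict(rollout_feed)        # placeholders filled from the rollout
    feed[agent.flat_tangent] = p     # direction to multiply by the Fisher matrix
    fvp = agent.session.run(agent.fisher_vect_prod, feed_dict=feed)
    return fvp + damping * p         # damping stabilizes conjugate gradient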
def train_step(self, episode, obs_all, Gs_all, actions_all, action_probs_all,
               total_reward, best_reward, entropy, t0):
    def surrogate_loss(theta=None):
        if theta is None:
            model = self.model
        else:
            model = self.tmp_model
            assign_vars(self.tmp_model, theta)
        logits = model(obs)
        action_prob = tf.nn.softmax(logits)
        action_prob = tf.reduce_sum(actions_one_hot * action_prob, axis=1)
        old_logits = self.model(obs)
        old_action_prob = tf.nn.softmax(old_logits)
        old_action_prob = tf.reduce_sum(actions_one_hot * old_action_prob,
                                        axis=1).numpy() + 1e-8
        prob_ratio = action_prob / old_action_prob  # pi(a|s) / pi_old(a|s)
        loss = tf.reduce_mean(prob_ratio * advantage) + self.ent_coeff * entropy
        return loss

    def kl_fn(theta=None):
        if theta is None:
            model = self.model
        else:
            model = self.tmp_model
            assign_vars(self.tmp_model, theta)
        logits = model(obs)
        action_prob = tf.nn.softmax(logits).numpy() + 1e-8
        old_logits = self.model(obs)
        old_action_prob = tf.nn.softmax(old_logits)
        return tf.reduce_mean(
            tf.reduce_sum(old_action_prob *
                          tf.math.log(old_action_prob / action_prob),
                          axis=1))

    def hessian_vector_product(p):
        def hvp_fn():
            kl_grad_vector = flatgrad(kl_fn, self.model.trainable_variables)
            grad_vector_product = tf.reduce_sum(kl_grad_vector * p)
            return grad_vector_product

        fisher_vector_product = flatgrad(
            hvp_fn, self.model.trainable_variables).numpy()
        return fisher_vector_product + (self.cg_damping * p)

    def conjugate_grad(Ax, b):
        """
        Conjugate gradient algorithm
        (see https://en.wikipedia.org/wiki/Conjugate_gradient_method)
        """
        x = np.zeros_like(b)
        # Note: should be 'b - Ax(x)', but for x=0, Ax(x)=0.
        # Change if doing warm start.
        r = b.copy()
        p = r.copy()
        r_dot_old = np.dot(r, r)
        for _ in range(self.cg_iters):
            z = Ax(p)
            alpha = r_dot_old / (np.dot(p, z) + 1e-8)
            old_x = x.copy()
            x += alpha * p
            r -= alpha * z
            r_dot_new = np.dot(r, r)
            beta = r_dot_new / (r_dot_old + 1e-8)
            r_dot_old = r_dot_new
            if r_dot_old < self.residual_tol:
                break
            p = r + beta * p
        if np.isnan(x).any():
            print("x is nan")
            print("z", np.isnan(z))
            print("old_x", np.isnan(old_x))
            print("kl_fn", np.isnan(kl_fn()))
        return x

    def linesearch(x, fullstep):
        # Backtracking line search: accept the largest step that satisfies the
        # KL constraint and does not hurt the surrogate objective.
        for (_n_backtracks, stepfrac) in enumerate(
                self.backtrack_coeff**np.arange(self.backtrack_iters)):
            xnew = x + stepfrac * fullstep
            newfval = surrogate_loss(xnew)
            kl_div = kl_fn(xnew)
            if np.isnan(kl_div):
                print("kl is nan")
                print("xnew", np.isnan(xnew))
                print("x", np.isnan(x))
                print("stepfrac", np.isnan(stepfrac))
                print("fullstep", np.isnan(fullstep))
            if kl_div <= self.delta and newfval >= 0:
                print("Linesearch worked at ", _n_backtracks)
                return xnew
            if _n_backtracks == self.backtrack_iters - 1:
                print("Linesearch failed.", kl_div, newfval)
        return x

    NBATCHES = len(obs_all) // self.BATCH_SIZE
    if len(obs_all) < self.BATCH_SIZE:
        NBATCHES += 1
    for batch_id in range(NBATCHES):
        obs = obs_all[batch_id * self.BATCH_SIZE:(batch_id + 1) *
                      self.BATCH_SIZE]
        Gs = Gs_all[batch_id * self.BATCH_SIZE:(batch_id + 1) *
                    self.BATCH_SIZE]
        actions = actions_all[batch_id * self.BATCH_SIZE:(batch_id + 1) *
                              self.BATCH_SIZE]
        action_probs = action_probs_all[batch_id *
                                        self.BATCH_SIZE:(batch_id + 1) *
                                        self.BATCH_SIZE]
        Vs = self.value_model(obs).numpy().flatten()
        # Advantage estimates (returns minus value baseline), normalized.
        # advantage = Gs
        advantage = Gs - Vs
        advantage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
        actions_one_hot = tf.one_hot(actions,
                                     self.envs[0].action_space.n,
                                     dtype="float64")

        policy_loss = surrogate_loss()
        policy_gradient = flatgrad(surrogate_loss,
                                   self.model.trainable_variables).numpy()
        # Solve H x = g with conjugate gradients, then rescale the step so the
        # quadratic approximation of the KL constraint equals self.delta.
        step_direction = conjugate_grad(hessian_vector_product,
                                        policy_gradient)
        shs = .5 * step_direction.dot(
            hessian_vector_product(step_direction).T)
        lm = np.sqrt(shs / self.delta) + 1e-8
        fullstep = step_direction / lm
        if np.isnan(fullstep).any():
            print("fullstep is nan")
            print("lm", lm)
            print("step_direction", step_direction)
            print("policy_gradient", policy_gradient)

        oldtheta = flatvars(self.model).numpy()
        theta = linesearch(oldtheta, fullstep)
        if np.isnan(theta).any():
            print("NaN detected. Skipping update...")
        else:
            assign_vars(self.model, theta)
        kl = kl_fn(oldtheta)

        # Fit the value function on the observed returns.
        history = self.value_model.fit(obs, Gs, epochs=5, verbose=0)
        value_loss = history.history["loss"][-1]
        print(f"Ep {episode}.{batch_id}: Rw_mean {total_reward} - "
              f"Rw_best {best_reward} - PL {policy_loss} - VL {value_loss} - "
              f"KL {kl} - epsilon {self.epsilon} - time {time.time() - t0}")
        if self.value_model:
            writer = self.writer
            with writer.as_default():
                tf.summary.scalar("reward", total_reward, step=episode)
                tf.summary.scalar("best_reward", best_reward, step=episode)
                tf.summary.scalar("value_loss", value_loss, step=episode)
                tf.summary.scalar("policy_loss", policy_loss, step=episode)
    self.epsilon = self.epsilon_decay(self.epsilon)
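# train_step depends on three eager-mode helpers (flatgrad, flatvars,
# assign_vars) defined elsewhere in the project. The sketches below (names
# suffixed _sketch; they are assumptions, not the project's actual
# implementations) show the behaviour the call sites above assume: flatgrad
# evaluates a zero-argument loss function under a GradientTape and flattens the
# gradients, flatvars flattens the model weights, and assign_vars writes a flat
# parameter vector back into a model.
import numpy as np
import tensorflow as tf

def flatgrad_eager_sketch(loss_fn, var_list):
    with tf.GradientTape() as tape:
        loss = loss_fn()
    grads = tape.gradient(loss, var_list)
    return tf.concat([tf.reshape(g, [-1]) for g in grads], axis=0)

def flatvars_sketch(model):
    return tf.concat(
        [tf.reshape(v, [-1]) for v in model.trainable_variables], axis=0)

def assign_vars_sketch(model, flat_theta):
    start = 0
    for v in model.trainable_variables:
        size = int(np.prod(v.shape))
        v.assign(tf.reshape(flat_theta[start:start + size], v.shape))
        start += size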
def make_model(self):
    self.observation_size = self.observation_space.shape[0]
    self.action_size = np.prod(self.action_space.shape)
    self.hidden_size = 64

    weight_init = tf.random_uniform_initializer(-0.05, 0.05)
    bias_init = tf.constant_initializer(0)

    config = tf.ConfigProto(device_count={'GPU': 0})
    self.session = tf.Session(config=config)

    self.obs = tf.placeholder(tf.float32, [None, self.observation_size])
    self.action = tf.placeholder(tf.float32, [None, self.action_size])
    self.advantage = tf.placeholder(tf.float32, [None])
    self.oldaction_dist_mu = tf.placeholder(tf.float32,
                                            [None, self.action_size])
    self.oldaction_dist_logstd = tf.placeholder(tf.float32,
                                                [None, self.action_size])

    with tf.variable_scope("policy"):
        h1 = utils.fully_connected(self.obs, self.observation_size,
                                   self.hidden_size, weight_init, bias_init,
                                   "policy_h1")
        h1 = tf.nn.relu(h1)
        h2 = utils.fully_connected(h1, self.hidden_size, self.hidden_size,
                                   weight_init, bias_init, "policy_h2")
        h2 = tf.nn.relu(h2)
        h3 = utils.fully_connected(h2, self.hidden_size, self.action_size,
                                   weight_init, bias_init, "policy_h3")
        action_dist_logstd_param = tf.Variable(
            (.01 * np.random.randn(1, self.action_size)).astype(np.float32),
            name="policy_logstd")
        # means for each action
        self.action_dist_mu = h3
        # log standard deviations for each action
        self.action_dist_logstd = tf.tile(
            action_dist_logstd_param,
            tf.stack((tf.shape(self.action_dist_mu)[0], 1)))

    batch_size = tf.shape(self.obs)[0]
    # log-probabilities of taking self.action under the new and old distributions
    log_p_n = utils.gauss_log_prob(self.action_dist_mu,
                                   self.action_dist_logstd, self.action)
    log_oldp_n = utils.gauss_log_prob(self.oldaction_dist_mu,
                                      self.oldaction_dist_logstd, self.action)

    # ratio = tf.exp(log_p_n) / tf.exp(log_oldp_n)
    ratio = tf.exp(log_p_n - log_oldp_n)
    # importance-sampled surrogate loss (L in the paper), negated for minimization
    surr = -tf.reduce_mean(ratio * self.advantage)
    var_list = tf.trainable_variables()

    batch_size_float = tf.cast(batch_size, tf.float32)
    # KL divergence and Shannon entropy, averaged over the batch
    kl = utils.gauss_KL(self.oldaction_dist_mu, self.oldaction_dist_logstd,
                        self.action_dist_mu,
                        self.action_dist_logstd) / batch_size_float
    ent = utils.gauss_ent(self.action_dist_mu,
                          self.action_dist_logstd) / batch_size_float

    self.losses = [surr, kl, ent]
    # policy gradient
    self.pg = utils.flatgrad(surr, var_list)

    # KL divergence of the policy with itself, first argument held constant
    kl_firstfixed = utils.gauss_selfKL_firstfixed(
        self.action_dist_mu, self.action_dist_logstd) / batch_size_float
    # gradient of the self-KL
    grads = tf.gradients(kl_firstfixed, var_list)

    # the vector we multiply the Fisher matrix by
    self.flat_tangent = tf.placeholder(tf.float32, [None])
    shapes = map(utils.var_shape, var_list)
    start = 0
    tangents = []
    for shape in shapes:
        size = np.prod(shape)
        param = tf.reshape(self.flat_tangent[start:(start + size)], shape)
        tangents.append(param)
        start += size
    # gradient of the self-KL dotted with the tangent
    gvp = [tf.reduce_sum(g * t) for (g, t) in zip(grads, tangents)]
    # second derivative of the self-KL times the tangent (Fisher-vector product)
    self.fvp = utils.flatgrad(gvp, var_list)

    # read the current flat parameter vector
    self.gf = utils.GetFlat(self.session, var_list)
    # set the parameters from a flat vector
    self.sff = utils.SetFromFlat(self.session, var_list)
    self.session.run(tf.global_variables_initializer())

    # value function
    # self.vf = VF(self.session)
    self.vf = LinearVF()

    self.get_policy = utils.GetPolicyWeights(self.session, var_list)
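# make_model relies on Gaussian helpers from utils (gauss_log_prob, gauss_KL,
# gauss_ent, gauss_selfKL_firstfixed) that are defined outside this section. As
# a hedged sketch only (the name is hypothetical and the real utils may differ),
# a diagonal-Gaussian log-probability consistent with how gauss_log_prob is
# called above could be written as:
import numpy as np
import tensorflow as tf

def gauss_log_prob_sketch(mu, logstd, x):
    # log N(x; mu, exp(logstd)^2), summed over the action dimensions.
    var = tf.exp(2.0 * logstd)
    log_density = (-tf.square(x - mu) / (2.0 * var)
                   - 0.5 * tf.log(2.0 * np.pi) - logstd)
    return tf.reduce_sum(log_density, axis=1)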