def set_tf(self, sess, expert_train_model, ob_space, ac_space, nenvs, nsteps):
    nact = ac_space.n
    nbatch = nenvs * nsteps
    self.A = tf.placeholder(tf.int32, [nbatch])  # actions
    self.D = tf.placeholder(tf.float32, [nbatch])  # dones
    self.R = tf.placeholder(tf.float32, [nbatch])  # rewards, not returns
    self.MU = tf.placeholder(tf.float32, [nbatch, nact])  # mu's
    self.LR = tf.placeholder(tf.float32, [])
    eps = 1e-6

    # step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    # params = find_trainable_variables("model")
    # print("Params {}".format(len(params)))
    # for var in params:
    #     print(var)

    # create polyak averaged model
    # ema = tf.train.ExponentialMovingAverage(alpha)
    # ema_apply_op = ema.apply(params)

    # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i
    v = tf.reduce_sum(tf.stop_gradient(expert_train_model.pi) * expert_train_model.q, axis=-1)  # shape is [nenvs * (nsteps + 1)]
    s_v = tf.reduce_sum(expert_train_model.pi * tf.stop_gradient(expert_train_model.q), axis=-1)
    v = strip(v, nenvs, nsteps, True)
    s_v = strip(s_v, nenvs, nsteps, True)  # strip off last step

    # f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [expert_train_model.pi, expert_polyak_model.pi, expert_train_model.q])
    fq = lambda var: strip(var, nenvs, nsteps)
    q_i = get_by_index(fq(expert_train_model.q), self.A)
    # v = tf.reduce_max(fq(expert_train_model.q), axis=1)
    # one_hot_A = tf.one_hot(self.A, nact)
    # pi = fq(expert_train_model.pi)
    # loss_policy = tf.reduce_mean(tf.square(pi - one_hot_A))

    # Get pi and q values for actions taken
    # v = strip(v, nenvs, nsteps, True)
    # loss_q = -tf.reduce_mean(q_i - tf.reshape(v, [nenvs * nsteps, 1]))
    loss_q = tf.nn.relu(tf.reduce_mean(v - q_i))
    loss_policy = -tf.reduce_mean(s_v - tf.stop_gradient(q_i))
    self.expert_loss = loss_q + loss_policy
    # self.expert_loss = loss_policy
    self.loss_q = loss_q
    self.loss_policy = loss_policy
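# --- Illustrative sketch (not part of the original file) ------------------------------
# strip() and get_by_index() are imported helpers; this is a minimal sketch of the
# behaviour the code above assumes, following the OpenAI baselines ACER utilities
# (batch_to_seq / seq_to_batch come from baselines.a2c.utils). Treat it as an
# assumption, not the authoritative implementation.
def strip(var, nenvs, nsteps, flat=False):
    # Split a [nenvs * (nsteps + 1), ...] batch into nsteps + 1 time slices,
    # drop the final (bootstrap) slice, and flatten back into a batch.
    seq = batch_to_seq(var, nenvs, nsteps + 1, flat)
    return seq_to_batch(seq[:-1], flat)

def get_by_index(x, idx):
    # x: [nbatch, nact], idx: [nbatch]; returns x[i, idx[i]] for every row i.
    idx_flattened = tf.range(0, x.shape[0]) * x.shape[1] + idx
    return tf.gather(tf.reshape(x, [-1]), idx_flattened)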
def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma,
             max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c,
             trust_region, alpha, delta, scope, load_path, debug, policy_inputs):
    self.sess = sess
    self.nenv = nenvs
    self.policy_inputs = policy_inputs.copy()
    nact = ac_space.n
    nbatch = nenvs * nsteps
    eps = 1e-6
    self.scope = scope

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        self.A = tf.placeholder(tf.int32, [nbatch], name="action")  # actions
        self.D = tf.placeholder(tf.float32, [nbatch], name="dones")  # dones
        self.R = tf.placeholder(tf.float32, [nbatch], name="rewards")  # rewards, not returns
        self.MU = tf.placeholder(tf.float32, [nbatch, nact], name="mus")  # mu's
        self.LR = tf.placeholder(tf.float32, [], name="lr")
        self.V_NEXT = tf.placeholder(tf.float32, [nbatch], name="value_next")  # (by lzn: we revise goal-conditioned next value)

        if isinstance(ob_space, gym.spaces.Dict):
            self.obs_shape = ob_space.spaces['observation'].shape
            self.obs_dtype = ob_space.spaces['observation'].dtype
        else:
            self.obs_shape = ob_space.shape
            self.obs_dtype = ob_space.dtype
        self.achieved_goal_sh = achieved_goal_sh = ACHIEVED_GOAL_SHAPE
        self.desired_goal_sh = desired_goal_sh = DESIRED_GOAL_SHAPE
        self.desired_goal_state_sh = desired_goal_state_sh = self.obs_shape

        self.step_obs_tf = tf.placeholder(self.obs_dtype, (nenvs,) + self.obs_shape, 'step_obs')
        self.step_achieved_goal_tf = tf.placeholder(tf.float32, (nenvs,) + achieved_goal_sh, 'step_achieved_goal')
        self.step_desired_goal_tf = tf.placeholder(tf.float32, (nenvs,) + desired_goal_sh, 'step_desired_goal')
        self.step_desired_goal_state_tf = tf.placeholder(self.obs_dtype, (nenvs,) + desired_goal_state_sh, 'step_desired_goal_state')
        self.train_obs_tf = tf.placeholder(self.obs_dtype, (nenvs * nsteps,) + self.obs_shape, 'train_obs')
        self.train_achieved_goal_tf = tf.placeholder(tf.float32, (nenvs * nsteps,) + achieved_goal_sh, 'train_achieved_goal')
        self.train_desired_goal_tf = tf.placeholder(tf.float32, (nenvs * nsteps,) + desired_goal_sh, 'train_desired_goal')
        self.train_desired_goal_state_tf = tf.placeholder(self.obs_dtype, (nenvs * nsteps,) + desired_goal_state_sh, 'train_desired_goal_state')

        # normalize embedding
        normalizer = 2500
        step_achieved_goal_tf = self.step_achieved_goal_tf / normalizer
        step_desired_goal_tf = self.step_desired_goal_tf / normalizer
        train_achieved_goal_tf = self.train_achieved_goal_tf / normalizer
        train_desired_goal_tf = self.train_desired_goal_tf / normalizer
        step_obs_tf = self.step_obs_tf
        step_desired_goal_state_tf = self.step_desired_goal_state_tf
        train_obs_tf = self.train_obs_tf
        train_desired_goal_state_tf = self.train_desired_goal_state_tf

        assert 'obs' in policy_inputs
        logger.info('policy_inputs:{}'.format(policy_inputs))
        logger.info('achieved_goal_sh:{}'.format(self.achieved_goal_sh))
        logger.info('desired_goal_sh:{}'.format(self.desired_goal_sh))
        logger.info('normalizer:{}'.format(normalizer))
        policy_inputs.remove('obs')

        if 'desired_goal_state' in policy_inputs:
            policy_inputs.remove('desired_goal_state')
            step_state_tf = tf.concat([step_obs_tf, step_desired_goal_state_tf], axis=-1, name='step_state')
            train_state_tf = tf.concat([train_obs_tf, train_desired_goal_state_tf], axis=-1, name='train_state')
        else:
            step_state_tf = step_obs_tf
            train_state_tf = train_obs_tf

        if 'achieved_goal' in policy_inputs and 'desired_goal' not in policy_inputs:
            policy_inputs.remove('achieved_goal')
            step_goal_tf = step_achieved_goal_tf
            train_goal_tf = train_achieved_goal_tf
        elif 'achieved_goal' not in policy_inputs and 'desired_goal' in policy_inputs:
            policy_inputs.remove('desired_goal')
            step_goal_tf = step_desired_goal_tf
            train_goal_tf = train_desired_goal_tf
        elif 'achieved_goal' in policy_inputs and 'desired_goal' in policy_inputs:
            policy_inputs.remove('achieved_goal')
            policy_inputs.remove('desired_goal')
            step_goal_tf = tf.concat([step_achieved_goal_tf, step_desired_goal_tf], axis=-1, name='step_goal')
            train_goal_tf = tf.concat([train_achieved_goal_tf, train_desired_goal_tf], axis=-1, name='train_goal')
        else:
            step_goal_tf, train_goal_tf = None, None

        if len(policy_inputs) > 0:
            raise ValueError("Unused policy inputs:{}".format(policy_inputs))

        self.step_model = policy(nbatch=nenvs, nsteps=1, state_placeholder=step_state_tf, sess=self.sess, goal_placeholder=step_goal_tf)
        self.train_model = policy(nbatch=nbatch, nsteps=nsteps, state_placeholder=train_state_tf, sess=self.sess, goal_placeholder=train_goal_tf, summary_stats=True)

    variables = find_trainable_variables
    self.params = params = variables(scope)
    logger.info("========================== {} =============================".format(scope))
    for var in params:
        logger.info(var)
    logger.info("========================== {} =============================\n".format(scope))

    # create polyak averaged model
    ema = tf.train.ExponentialMovingAverage(alpha)
    ema_apply_op = ema.apply(params)

    def custom_getter(getter, *args, **kwargs):
        v = ema.average(getter(*args, **kwargs))
        # print(v.name)
        return v

    with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True):
        self.polyak_model = policy(nbatch=nbatch, nsteps=nsteps, state_placeholder=train_state_tf, goal_placeholder=train_goal_tf, sess=self.sess)

    # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i
    # action probability distributions according to self.train_model, self.polyak_model and self.step_model
    # policy.pi holds the distribution parameters; take a softmax to obtain a distribution that sums to 1
    train_model_p = tf.nn.softmax(self.train_model.pi)
    polyak_model_p = tf.nn.softmax(self.polyak_model.pi)
    self.step_model_p = tf.nn.softmax(self.step_model.pi)

    # (todo by lizn, use this to calculate next value)
    v = self.v = tf.reduce_sum(train_model_p * self.train_model.q, axis=-1)  # shape is [nenvs * nsteps]

    # strip off last step
    # (todo by lizn, we don't need strip)
    f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, self.train_model.q])
    # f, f_pol, q = map(lambda x: x, [train_model_p, polyak_model_p, self.train_model.q])

    # Get pi and q values for actions taken
    f_i = get_by_index(f, self.A)
    q_i = get_by_index(q, self.A)

    # Compute ratios for importance truncation
    rho = f / (self.MU + eps)
    rho_i = get_by_index(rho, self.A)

    # Calculate Q_retrace targets
    qret = q_retrace(self.R, self.D, q_i, self.V_NEXT, rho_i, nenvs, nsteps, gamma)  # (todo by lizn, use new next state value)

    # Calculate losses
    # Entropy
    # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps))
    entropy = tf.reduce_mean(cat_entropy_softmax(f))

    # Policy Gradient loss, with truncated importance sampling & bias correction
    v = strip(v, nenvs, nsteps, True)  # (todo by lzn: we do not need to strip the last one)
    check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
    check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

    # Truncated importance sampling
    adv = qret - v
    logf = tf.log(f_i + eps)
    gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
    loss_f = -tf.reduce_mean(gain_f)

    # Bias correction for the truncation
    adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
    logf_bc = tf.log(f + eps)  # / (f_old + eps)
    check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
    gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1)  # IMP: This is sum, as expectation wrt f
    loss_bc = -tf.reduce_mean(gain_bc)

    loss_policy = loss_f + loss_bc

    # Value/Q function loss, and explained variance
    check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
    ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
    loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

    # Net loss
    check_shape([loss_policy, loss_q, entropy], [[]] * 3)
    # Goal loss
    loss = loss_policy + q_coef * loss_q - ent_coef * entropy

    if trust_region:
        g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f)  # [nenvs * nsteps, nact]
        # k = tf.gradients(KL(f_pol || f), f)
        k = -f_pol / (f + eps)  # [nenvs * nsteps, nact]  # Directly computed gradient of KL divergence wrt f
        k_dot_g = tf.reduce_sum(k * g, axis=-1)
        adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

        # Calculate stats (before doing adjustment) for logging.
        avg_norm_k = avg_norm(k)
        avg_norm_g = avg_norm(g)
        avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
        avg_norm_adj = tf.reduce_mean(tf.abs(adj))

        g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
        grads_f = -g / (nenvs * nsteps)  # These are trust region adjusted gradients wrt f, i.e. the statistics of policy pi
        grads_policy = tf.gradients(f, params, grads_f)
        grads_q = tf.gradients(loss_q * q_coef, params)
        # print("=========================== grads add ==============================")
        grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]
        # print("=========================== grads add ==============================\n")

        avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
        norm_grads_q = tf.global_norm(grads_q)
        norm_grads_policy = tf.global_norm(grads_policy)
    else:
        grads = tf.gradients(loss, params)

    if max_grad_norm is not None:
        grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR, decay=rprop_alpha, epsilon=rprop_epsilon)
    _policy_opt_op = trainer.apply_gradients(grads)

    # so when you call _train, you first do the gradient step, then you apply ema
    with tf.control_dependencies([_policy_opt_op]):
        _train_policy = tf.group(ema_apply_op)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    # Ops/Summaries to run, and their names for logging
    self.run_ops_policy = [_train_policy, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
    self.names_ops_policy = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads']
    if trust_region:
        self.run_ops_policy = self.run_ops_policy + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
        self.names_ops_policy = self.names_ops_policy + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']
    self.names_ops_policy = [scope + "_" + x for x in self.names_ops_policy]  # scope as prefix

    self.save = functools.partial(save_variables, sess=self.sess, variables=params)
    self.initial_state = self.step_model.initial_state

    # with tf.variable_scope('stats'):
    #     with tf.variable_scope('achieved_goal'):
    #         self.ag_stats = Normalizer(size=self.achieved_goal_sh[0], sess=self.sess)
    #     with tf.variable_scope('desired_goal'):
    #         self.g_stats = Normalizer(size=self.desired_goal_sh[0], sess=self.sess)

    if debug:
        tf.global_variables_initializer().run(session=self.sess)
        load_variables(load_path, self.params, self.sess)
    else:
        tf.global_variables_initializer().run(session=self.sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef, q_coef,
             gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c,
             trust_region, alpha, delta):
    config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs)
    sess = tf.Session(config=config)
    nact = ac_space.n
    nbatch = nenvs * nsteps

    A = tf.placeholder(tf.int32, [nbatch])  # actions
    D = tf.placeholder(tf.float32, [nbatch])  # dones
    R = tf.placeholder(tf.float32, [nbatch])  # rewards, not returns
    MU = tf.placeholder(tf.float32, [nbatch, nact])  # mu's
    LR = tf.placeholder(tf.float32, [])
    eps = 1e-6

    step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)

    params = find_trainable_variables("model")
    print("Params {}".format(len(params)))
    for var in params:
        print(var)

    # create polyak averaged model
    ema = tf.train.ExponentialMovingAverage(alpha)
    ema_apply_op = ema.apply(params)

    def custom_getter(getter, *args, **kwargs):
        v = ema.average(getter(*args, **kwargs))
        print(v.name)
        return v

    with tf.variable_scope("", custom_getter=custom_getter, reuse=True):
        polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)

    # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i
    v = tf.reduce_sum(train_model.pi * train_model.q, axis=-1)  # shape is [nenvs * (nsteps + 1)]

    # strip off last step
    f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q])
    # Get pi and q values for actions taken
    f_i = get_by_index(f, A)
    q_i = get_by_index(q, A)

    # Compute ratios for importance truncation
    rho = f / (MU + eps)
    rho_i = get_by_index(rho, A)

    # Calculate Q_retrace targets
    qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

    # Calculate losses
    # Entropy
    entropy = tf.reduce_mean(cat_entropy_softmax(f))

    # Policy Gradient loss, with truncated importance sampling & bias correction
    v = strip(v, nenvs, nsteps, True)
    check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
    check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

    # Truncated importance sampling
    adv = qret - v
    logf = tf.log(f_i + eps)
    gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
    loss_f = -tf.reduce_mean(gain_f)

    # Bias correction for the truncation
    adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
    logf_bc = tf.log(f + eps)  # / (f_old + eps)
    check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
    gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1)  # IMP: This is sum, as expectation wrt f
    loss_bc = -tf.reduce_mean(gain_bc)

    loss_policy = loss_f + loss_bc

    # Value/Q function loss, and explained variance
    check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
    ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
    loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

    # Net loss
    check_shape([loss_policy, loss_q, entropy], [[]] * 3)
    loss = loss_policy + q_coef * loss_q - ent_coef * entropy

    if trust_region:
        g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f)  # [nenvs * nsteps, nact]
        # k = tf.gradients(KL(f_pol || f), f)
        k = -f_pol / (f + eps)  # [nenvs * nsteps, nact]  # Directly computed gradient of KL divergence wrt f
        k_dot_g = tf.reduce_sum(k * g, axis=-1)
        adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

        # Calculate stats (before doing adjustment) for logging.
        avg_norm_k = avg_norm(k)
        avg_norm_g = avg_norm(g)
        avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
        avg_norm_adj = tf.reduce_mean(tf.abs(adj))

        g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
        grads_f = -g / (nenvs * nsteps)  # These are trust region adjusted gradients wrt f, i.e. the statistics of policy pi
        grads_policy = tf.gradients(f, params, grads_f)
        grads_q = tf.gradients(loss_q * q_coef, params)
        grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]

        avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
        norm_grads_q = tf.global_norm(grads_q)
        norm_grads_policy = tf.global_norm(grads_policy)
    else:
        grads = tf.gradients(loss, params)

    if max_grad_norm is not None:
        grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
    _opt_op = trainer.apply_gradients(grads)

    # so when you call _train, you first do the gradient step, then you apply ema
    with tf.control_dependencies([_opt_op]):
        _train = tf.group(ema_apply_op)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    # Ops/Summaries to run, and their names for logging
    run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
    names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads']
    if trust_region:
        run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
        names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']

    def train(obs, actions, rewards, dones, mus, states, masks, steps):
        cur_lr = lr.value_steps(steps)
        td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}
        if states != []:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
            td_map[polyak_model.S] = states
            td_map[polyak_model.M] = masks
        return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train

    def save(save_path):
        ps = sess.run(params)
        make_path(osp.dirname(save_path))
        joblib.dump(ps, save_path)

    self.train = train
    self.save = save
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
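# --- Illustrative sketch (not part of the original file) ------------------------------
# q_retrace() is imported from elsewhere; this is the Retrace target recursion the calls
# above are assumed to implement, following the baselines ACER implementation of Q_ret.
# The goal-conditioned variant earlier in this file feeds V_NEXT of shape [nbatch]
# instead of the full [nenvs * (nsteps + 1)] value vector, so its version differs.
def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
    rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True)  # truncated IS weights
    rs = batch_to_seq(R, nenvs, nsteps, True)        # rewards
    ds = batch_to_seq(D, nenvs, nsteps, True)        # dones
    q_is = batch_to_seq(q_i, nenvs, nsteps, True)
    vs = batch_to_seq(v, nenvs, nsteps + 1, True)    # values, including the bootstrap step
    qret = vs[-1]                                    # bootstrap from the final value
    qrets = []
    for i in range(nsteps - 1, -1, -1):
        qret = rs[i] + gamma * qret * (1.0 - ds[i])
        qrets.append(qret)
        qret = (rho_bar[i] * (qret - q_is[i])) + vs[i]
    return seq_to_batch(qrets[::-1], flat=True)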
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm,
             lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta):
    sess = get_session()
    nact = ac_space.n
    nbatch = nenvs * nsteps

    A = tf.placeholder(tf.int32, [nbatch])  # actions
    D = tf.placeholder(tf.float32, [nbatch])  # dones
    R = tf.placeholder(tf.float32, [nbatch])  # rewards, not returns
    MU = tf.placeholder(tf.float32, [nbatch, nact])  # mu's
    LR = tf.placeholder(tf.float32, [])
    eps = 1e-6

    step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape)
    train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs * (nsteps + 1),) + ob_space.shape)
    with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE):
        step_model = policy(nbatch=nenvs, nsteps=1, observ_placeholder=step_ob_placeholder, sess=sess)
        train_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)

    params = find_trainable_variables("acer_model")
    print("Params {}".format(len(params)))
    for var in params:
        print(var)

    # create polyak averaged model
    ema = tf.train.ExponentialMovingAverage(alpha)
    ema_apply_op = ema.apply(params)

    def custom_getter(getter, *args, **kwargs):
        v = ema.average(getter(*args, **kwargs))
        print(v.name)
        return v

    with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True):
        polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)

    # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i

    # action probability distributions according to train_model, polyak_model and step_model
    # policy.pi holds the distribution parameters; take a softmax to obtain a distribution that sums to 1
    train_model_p = tf.nn.softmax(train_model.pi)
    polyak_model_p = tf.nn.softmax(polyak_model.pi)
    step_model_p = tf.nn.softmax(step_model.pi)

    v = tf.reduce_sum(train_model_p * train_model.q, axis=-1)  # shape is [nenvs * (nsteps + 1)]

    # strip off last step
    f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q])
    # Get pi and q values for actions taken
    f_i = get_by_index(f, A)
    q_i = get_by_index(q, A)

    # Compute ratios for importance truncation
    rho = f / (MU + eps)
    rho_i = get_by_index(rho, A)

    # Calculate Q_retrace targets
    qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

    # Calculate losses
    # Entropy
    # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
    entropy = tf.reduce_mean(cat_entropy_softmax(f))

    # Policy Gradient loss, with truncated importance sampling & bias correction
    v = strip(v, nenvs, nsteps, True)
    check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
    check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

    # Truncated importance sampling
    adv = qret - v
    logf = tf.log(f_i + eps)
    gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
    loss_f = -tf.reduce_mean(gain_f)

    # Bias correction for the truncation
    adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
    logf_bc = tf.log(f + eps)  # / (f_old + eps)
    check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
    gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1)  # IMP: This is sum, as expectation wrt f
    loss_bc = -tf.reduce_mean(gain_bc)

    loss_policy = loss_f + loss_bc

    # Value/Q function loss, and explained variance
    check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
    ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
    loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

    # Net loss
    check_shape([loss_policy, loss_q, entropy], [[]] * 3)
    loss = loss_policy + q_coef * loss_q - ent_coef * entropy

    if trust_region:
        g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f)  # [nenvs * nsteps, nact]
        # k = tf.gradients(KL(f_pol || f), f)
        k = -f_pol / (f + eps)  # [nenvs * nsteps, nact]  # Directly computed gradient of KL divergence wrt f
        k_dot_g = tf.reduce_sum(k * g, axis=-1)
        adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

        # Calculate stats (before doing adjustment) for logging.
        avg_norm_k = avg_norm(k)
        avg_norm_g = avg_norm(g)
        avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
        avg_norm_adj = tf.reduce_mean(tf.abs(adj))

        g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
        grads_f = -g / (nenvs * nsteps)  # These are trust region adjusted gradients wrt f, i.e. the statistics of policy pi
        grads_policy = tf.gradients(f, params, grads_f)
        grads_q = tf.gradients(loss_q * q_coef, params)
        grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]

        avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
        norm_grads_q = tf.global_norm(grads_q)
        norm_grads_policy = tf.global_norm(grads_policy)
    else:
        grads = tf.gradients(loss, params)

    if max_grad_norm is not None:
        grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
    _opt_op = trainer.apply_gradients(grads)

    # so when you call _train, you first do the gradient step, then you apply ema
    with tf.control_dependencies([_opt_op]):
        _train = tf.group(ema_apply_op)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    # Ops/Summaries to run, and their names for logging
    run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
    names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads']
    if trust_region:
        run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
        names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']

    def train(obs, actions, rewards, dones, mus, states, masks, steps):
        cur_lr = lr.value_steps(steps)
        td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
            td_map[polyak_model.S] = states
            td_map[polyak_model.M] = masks
        return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train

    def _step(observation, **kwargs):
        return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs)

    self.train = train
    self.save = functools.partial(save_variables, sess=sess, variables=params)
    self.train_model = train_model
    self.step_model = step_model
    self._step = _step
    self.step = self.step_model.step
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
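# --- Hypothetical usage sketch (not part of the original file) -------------------------
# How the Model above is typically driven. `policy_fn`, `runner`, `steps`, and the
# hyperparameter values are illustrative assumptions (the numbers mirror common
# baselines defaults), not definitions made in this file.
model = Model(policy=policy_fn, ob_space=env.observation_space, ac_space=env.action_space,
              nenvs=nenvs, nsteps=nsteps, ent_coef=0.01, q_coef=0.5, gamma=0.99,
              max_grad_norm=10, lr=7e-4, rprop_alpha=0.99, rprop_epsilon=1e-5,
              total_timesteps=int(1e6), lrschedule='linear', c=10.0,
              trust_region=True, alpha=0.99, delta=1)
obs, actions, rewards, mus, dones, masks = runner.run()  # flattened rollout batches
names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps)
for name, val in zip(names_ops, values_ops):
    logger.record_tabular(name, float(val))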
def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma,
             max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c,
             trust_region, alpha, delta, scope, goal_shape):
    self.sess = sess
    self.nenv = nenvs
    self.goal_shape = goal_shape
    nact = ac_space.n
    nbatch = nenvs * nsteps
    eps = 1e-6
    self.scope = scope

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        self.A = tf.placeholder(tf.int32, [nbatch], name="action")  # actions
        self.D = tf.placeholder(tf.float32, [nbatch], name="dones")  # dones
        self.R = tf.placeholder(tf.float32, [nbatch], name="rewards")  # rewards, not returns
        self.MU = tf.placeholder(tf.float32, [nbatch, nact], name="mus")  # mu's
        self.LR = tf.placeholder(tf.float32, [], name="lr")
        step_ob_placeholder = tf.placeholder(ob_space.dtype, (nenvs,) + ob_space.shape, "step_ob")
        step_goal_placeholder = tf.placeholder(tf.float32, (nenvs,) + goal_shape, "step_goal")
        step_goal_encoded = step_goal_placeholder
        train_ob_placeholder = tf.placeholder(ob_space.dtype, (nenvs * (nsteps + 1),) + ob_space.shape, "train_ob")
        train_goal_placeholder = tf.placeholder(tf.float32, (nenvs * (nsteps + 1),) + goal_shape, "train_goal")
        train_goal_encoded = train_goal_placeholder
        concat_on_latent = False
        self.step_model = policy(nbatch=nenvs, nsteps=1, observ_placeholder=step_ob_placeholder, sess=self.sess,
                                 goal_placeholder=step_goal_placeholder, concat_on_latent=concat_on_latent, goal_encoded=step_goal_encoded)
        self.train_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=self.sess,
                                  goal_placeholder=train_goal_placeholder, concat_on_latent=concat_on_latent, goal_encoded=train_goal_encoded)

    variables = find_trainable_variables
    self.params = params = variables(scope)
    logger.info("========================== {} =============================".format(scope))
    for var in params:
        logger.info(var)
    logger.info("========================== {} =============================\n".format(scope))

    # create polyak averaged model
    ema = tf.train.ExponentialMovingAverage(alpha)
    ema_apply_op = ema.apply(params)

    def custom_getter(getter, *args, **kwargs):
        v = ema.average(getter(*args, **kwargs))
        # print(v.name)
        return v

    with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True):
        self.polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder,
                                   goal_placeholder=train_goal_placeholder, sess=self.sess,
                                   concat_on_latent=concat_on_latent, goal_encoded=train_goal_encoded)

    # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i
    # action probability distributions according to self.train_model, self.polyak_model and self.step_model
    # policy.pi holds the distribution parameters; take a softmax to obtain a distribution that sums to 1
    train_model_p = tf.nn.softmax(self.train_model.pi)
    polyak_model_p = tf.nn.softmax(self.polyak_model.pi)
    self.step_model_p = tf.nn.softmax(self.step_model.pi)

    self.v = v = tf.reduce_sum(train_model_p * self.train_model.q, axis=-1)  # shape is [nenvs * (nsteps + 1)]

    # strip off last step
    f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, self.train_model.q])
    # Get pi and q values for actions taken
    f_i = get_by_index(f, self.A)
    q_i = get_by_index(q, self.A)

    # Compute ratios for importance truncation
    rho = f / (self.MU + eps)
    rho_i = get_by_index(rho, self.A)

    # Calculate Q_retrace targets
    self.qret = qret = q_retrace(self.R, self.D, q_i, v, rho_i, nenvs, nsteps, gamma)

    # Calculate losses
    # Entropy
    # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps))
    entropy = tf.reduce_mean(cat_entropy_softmax(f))

    # Policy Gradient loss, with truncated importance sampling & bias correction
    v = strip(v, nenvs, nsteps, True)
    check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
    check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

    # Truncated importance sampling
    adv = qret - v
    logf = tf.log(f_i + eps)
    gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
    loss_f = -tf.reduce_mean(gain_f)

    # Bias correction for the truncation
    adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
    logf_bc = tf.log(f + eps)  # / (f_old + eps)
    check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
    gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1)  # IMP: This is sum, as expectation wrt f
    loss_bc = -tf.reduce_mean(gain_bc)

    loss_policy = loss_f + loss_bc

    # Value/Q function loss, and explained variance
    check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
    ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
    loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

    # Net loss
    check_shape([loss_policy, loss_q, entropy], [[]] * 3)
    # Goal loss
    loss = loss_policy + q_coef * loss_q - ent_coef * entropy

    if trust_region:
        g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f)  # [nenvs * nsteps, nact]
        # k = tf.gradients(KL(f_pol || f), f)
        k = -f_pol / (f + eps)  # [nenvs * nsteps, nact]  # Directly computed gradient of KL divergence wrt f
        k_dot_g = tf.reduce_sum(k * g, axis=-1)
        adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

        # Calculate stats (before doing adjustment) for logging.
        avg_norm_k = avg_norm(k)
        avg_norm_g = avg_norm(g)
        avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
        avg_norm_adj = tf.reduce_mean(tf.abs(adj))

        g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
        grads_f = -g / (nenvs * nsteps)  # These are trust region adjusted gradients wrt f, i.e. the statistics of policy pi
        grads_policy = tf.gradients(f, params, grads_f)
        grads_q = tf.gradients(loss_q * q_coef, params)
        # print("=========================== grads add ==============================")
        grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]
        # print("=========================== grads add ==============================\n")

        avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
        norm_grads_q = tf.global_norm(grads_q)
        norm_grads_policy = tf.global_norm(grads_policy)
    else:
        grads = tf.gradients(loss, params)

    if max_grad_norm is not None:
        grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR, decay=rprop_alpha, epsilon=rprop_epsilon)
    _policy_opt_op = trainer.apply_gradients(grads)

    # so when you call _train, you first do the gradient step, then you apply ema
    with tf.control_dependencies([_policy_opt_op]):
        _train_policy = tf.group(ema_apply_op)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    # Ops/Summaries to run, and their names for logging
    self.run_ops_policy = [_train_policy, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
    self.names_ops_policy = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads']
    if trust_region:
        self.run_ops_policy = self.run_ops_policy + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
        self.names_ops_policy = self.names_ops_policy + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']
    self.names_ops_policy = [scope + "_" + x for x in self.names_ops_policy]  # scope as prefix

    self.save = functools.partial(save_variables, sess=self.sess, variables=params)
    self.initial_state = self.step_model.initial_state
    tf.global_variables_initializer().run(session=self.sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm,
             lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta, icm):
    sess = get_session()
    nact = ac_space.n
    nbatch = nenvs * nsteps

    A = tf.placeholder(tf.int32, [nbatch])  # actions
    D = tf.placeholder(tf.float32, [nbatch])  # dones
    R = tf.placeholder(tf.float32, [nbatch])  # rewards, not returns
    MU = tf.placeholder(tf.float32, [nbatch, nact])  # mu's
    LR = tf.placeholder(tf.float32, [])
    eps = 1e-6

    step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape)
    train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs * (nsteps + 1),) + ob_space.shape)
    with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE):
        step_model = policy(nbatch=nenvs, nsteps=1, observ_placeholder=step_ob_placeholder, sess=sess)
        train_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)

    params = find_trainable_variables("acer_model")
    print("Params {}".format(len(params)))
    # for var in params:
    #     print(var)

    # create polyak averaged model
    ema = tf.train.ExponentialMovingAverage(alpha)
    ema_apply_op = ema.apply(params)

    def custom_getter(getter, *args, **kwargs):
        v = ema.average(getter(*args, **kwargs))
        print(v.name)
        return v

    with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True):
        polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)

    # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i
    # shape is [nenvs * (nsteps + 1)]
    # action probability distributions according to train_model, polyak_model and step_model
    # policy.pi holds the distribution parameters; take a softmax to obtain a distribution that sums to 1
    train_model_p = tf.nn.softmax(train_model.pi)
    polyak_model_p = tf.nn.softmax(polyak_model.pi)
    step_model_p = tf.nn.softmax(step_model.pi)

    # train model policy probability and train model q value
    v = tf.reduce_sum(train_model_p * train_model.q, axis=-1)  # shape is [nenvs * (nsteps + 1)]

    # strip off last step
    # distribution_f, f_polyak, q_value
    f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q])
    # Get pi and q values for actions taken
    f_i = get_by_index(f, A)
    q_i = get_by_index(q, A)

    # Compute ratios for importance truncation
    rho = f / (MU + eps)
    rho_i = get_by_index(rho, A)

    # Calculate Q_retrace targets
    # R = reward, D = done_ph, v = value; the rest is the same
    qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

    # Calculate losses
    # Entropy
    # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
    entropy = tf.reduce_mean(cat_entropy_softmax(f))  # f is the distribution here

    # Policy Gradient loss, with truncated importance sampling & bias correction
    v = strip(v, nenvs, nsteps, True)  # v is the value here
    check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
    check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

    # Truncated importance sampling
    if icm is None:
        adv = qret - v
    else:
        # print("Adv Normalization")
        # > Advantage Normalization
        adv = qret - v
        # m, s = get_mean_and_std(icm_adv)
        # advs = (icm_adv - m) / (s + 1e-7)
        # > Advantage Normalization
    logf = tf.log(f_i + eps)
    # c is the importance weight clipping (correction) factor
    gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
    loss_f = -tf.reduce_mean(gain_f)

    # Bias correction for the truncation
    adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
    logf_bc = tf.log(f + eps)  # / (f_old + eps)
    check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
    gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1)  # IMP: This is sum, as expectation wrt f
    loss_bc = -tf.reduce_mean(gain_bc)

    loss_policy = loss_f + loss_bc

    # Value/Q function loss, and explained variance
    check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
    ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
    loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

    # Net loss
    check_shape([loss_policy, loss_q, entropy], [[]] * 3)
    loss = loss_policy + q_coef * loss_q - ent_coef * entropy

    if trust_region:
        g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f)  # [nenvs * nsteps, nact]
        # k = tf.gradients(KL(f_pol || f), f)
        k = -f_pol / (f + eps)  # [nenvs * nsteps, nact]  # Directly computed gradient of KL divergence wrt f
        k_dot_g = tf.reduce_sum(k * g, axis=-1)
        adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

        # Calculate stats (before doing adjustment) for logging.
        avg_norm_k = avg_norm(k)
        avg_norm_g = avg_norm(g)
        avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
        avg_norm_adj = tf.reduce_mean(tf.abs(adj))

        g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
        grads_f = -g / (nenvs * nsteps)  # These are trust region adjusted gradients wrt f, i.e. the statistics of policy pi
        grads_policy = tf.gradients(f, params, grads_f)
        grads_q = tf.gradients(loss_q * q_coef, params)
        grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]

        avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
        norm_grads_q = tf.global_norm(grads_q)
        norm_grads_policy = tf.global_norm(grads_policy)
    else:
        grads = tf.gradients(loss, params)

    if max_grad_norm is not None:
        grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    if icm is not None:
        # print("with ICM")
        grads = grads + icm.pred_grads_and_vars
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
    _opt_op = trainer.apply_gradients(grads)

    # so when you call _train, you first do the gradient step, then you apply ema
    with tf.control_dependencies([_opt_op]):
        _train = tf.group(ema_apply_op)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    # Ops/Summaries to run, and their names for logging
    if icm is not None:
        # print("With ICM")
        run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads, icm.forw_loss, icm.inv_loss, icm.icm_loss]
        names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads', 'icm.forw_loss', 'icm.inv_loss', 'icm.icm_loss']
        if trust_region:
            run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
            names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']
    else:
        run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
        names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads']
        if trust_region:
            run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
            names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']

    def train(obs, actions, rewards, dones, mus, states, masks, steps, next_states, icm_actions):
        cur_lr = lr.value_steps(steps)
        if icm is not None:
            print("with ICM")
            td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr,
                      icm.state_: obs, icm.next_state_: next_states, icm.action_: icm_actions}
        else:
            td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
            td_map[polyak_model.S] = states
            td_map[polyak_model.M] = masks
        return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train

    def _step(observation, **kwargs):
        return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs)

    self.train = train
    self.save = functools.partial(save_variables, sess=sess, variables=params)
    self.train_model = train_model
    self.step_model = step_model
    self._step = _step
    self.step = self.step_model.step
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, n_envs, n_steps, n_stack, num_procs, ent_coef, q_coef,
             gamma, max_grad_norm, learning_rate, rprop_alpha, rprop_epsilon, total_timesteps,
             lr_schedule, correction_term, trust_region, alpha, delta):
    """
    The ACER (Actor-Critic with Experience Replay) model class, https://arxiv.org/abs/1611.01224

    :param policy: (AcerPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param ob_space: (Gym Space) The observation space
    :param ac_space: (Gym Space) The action space
    :param n_envs: (int) The number of environments
    :param n_steps: (int) The number of steps to run for each environment
    :param n_stack: (int) The number of stacked frames
    :param num_procs: (int) The number of threads for TensorFlow operations
    :param ent_coef: (float) The weight for the entropy loss
    :param q_coef: (float) The weight for the loss on the Q value
    :param gamma: (float) The discount value
    :param max_grad_norm: (float) The clipping value for the maximum gradient
    :param learning_rate: (float) The initial learning rate for the RMS prop optimizer
    :param rprop_alpha: (float) RMS prop optimizer decay rate
    :param rprop_epsilon: (float) RMS prop optimizer epsilon
    :param total_timesteps: (int) The total number of timesteps for training the model
    :param lr_schedule: (str) The scheduler for a dynamic learning rate
    :param correction_term: (float) The correction term for the weights
    :param trust_region: (bool) Enable Trust region policy optimization loss
    :param alpha: (float) The decay rate for the Exponential moving average of the parameters
    :param delta: (float) trust region delta value
    """
    config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs)
    sess = tf.Session(config=config)
    n_act = ac_space.n
    n_batch = n_envs * n_steps

    action_ph = tf.placeholder(tf.int32, [n_batch])  # actions
    done_ph = tf.placeholder(tf.float32, [n_batch])  # dones
    reward_ph = tf.placeholder(tf.float32, [n_batch])  # rewards, not returns
    mu_ph = tf.placeholder(tf.float32, [n_batch, n_act])  # mu's
    learning_rate_ph = tf.placeholder(tf.float32, [])
    eps = 1e-6

    step_model = policy(sess, ob_space, ac_space, n_envs, 1, n_stack, reuse=False)
    train_model = policy(sess, ob_space, ac_space, n_envs, n_steps + 1, n_stack, reuse=True)

    params = find_trainable_variables("model")
    print("Params {}".format(len(params)))
    for var in params:
        print(var)

    # create polyak averaged model
    ema = tf.train.ExponentialMovingAverage(alpha)
    ema_apply_op = ema.apply(params)

    def custom_getter(getter, *args, **kwargs):
        val = ema.average(getter(*args, **kwargs))
        print(val.name)
        return val

    with tf.variable_scope("", custom_getter=custom_getter, reuse=True):
        polyak_model = policy(sess, ob_space, ac_space, n_envs, n_steps + 1, n_stack, reuse=True)

    # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i
    value = tf.reduce_sum(train_model.policy * train_model.q_value, axis=-1)  # shape is [n_envs * (n_steps + 1)]

    # strip off last step
    # f is a distribution, chosen to be Gaussian distributions with fixed diagonal covariance and mean \phi(x) in the paper
    distribution_f, f_polyak, q_value = map(lambda variables: strip(variables, n_envs, n_steps),
                                            [train_model.policy, polyak_model.policy, train_model.q_value])
    # Get pi and q values for actions taken
    f_i = get_by_index(distribution_f, action_ph)
    q_i = get_by_index(q_value, action_ph)

    # Compute ratios for importance truncation
    rho = distribution_f / (mu_ph + eps)
    rho_i = get_by_index(rho, action_ph)

    # Calculate Q_retrace targets
    qret = q_retrace(reward_ph, done_ph, q_i, value, rho_i, n_envs, n_steps, gamma)

    # Calculate losses
    # Entropy
    entropy = tf.reduce_mean(calc_entropy_softmax(distribution_f))

    # Policy Gradient loss, with truncated importance sampling & bias correction
    value = strip(value, n_envs, n_steps, True)
    check_shape([qret, value, rho_i, f_i], [[n_envs * n_steps]] * 4)
    check_shape([rho, distribution_f, q_value], [[n_envs * n_steps, n_act]] * 2)

    # Truncated importance sampling
    adv = qret - value
    log_f = tf.log(f_i + eps)
    gain_f = log_f * tf.stop_gradient(adv * tf.minimum(correction_term, rho_i))  # [n_envs * n_steps]
    loss_f = -tf.reduce_mean(gain_f)

    # Bias correction for the truncation
    adv_bc = (q_value - tf.reshape(value, [n_envs * n_steps, 1]))  # [n_envs * n_steps, n_act]
    log_f_bc = tf.log(distribution_f + eps)  # / (f_old + eps)
    check_shape([adv_bc, log_f_bc], [[n_envs * n_steps, n_act]] * 2)
    gain_bc = tf.reduce_sum(log_f_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (correction_term / (rho + eps))) * distribution_f), axis=1)  # IMP: This is sum, as expectation wrt f
    loss_bc = -tf.reduce_mean(gain_bc)

    loss_policy = loss_f + loss_bc

    # Value/Q function loss, and explained variance
    check_shape([qret, q_i], [[n_envs * n_steps]] * 2)
    explained_variance = q_explained_variance(tf.reshape(q_i, [n_envs, n_steps]), tf.reshape(qret, [n_envs, n_steps]))
    loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

    # Net loss
    check_shape([loss_policy, loss_q, entropy], [[]] * 3)
    loss = loss_policy + q_coef * loss_q - ent_coef * entropy

    if trust_region:
        # [n_envs * n_steps, n_act]
        grad = tf.gradients(-(loss_policy - ent_coef * entropy) * n_steps * n_envs, distribution_f)
        # [n_envs * n_steps, n_act]
        # Directly computed gradient of KL divergence wrt f
        kl_grad = -f_polyak / (distribution_f + eps)
        k_dot_g = tf.reduce_sum(kl_grad * grad, axis=-1)
        adj = tf.maximum(0.0, (tf.reduce_sum(kl_grad * grad, axis=-1) - delta) / (tf.reduce_sum(tf.square(kl_grad), axis=-1) + eps))  # [n_envs * n_steps]

        # Calculate stats (before doing adjustment) for logging.
        avg_norm_k = avg_norm(kl_grad)
        avg_norm_g = avg_norm(grad)
        avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
        avg_norm_adj = tf.reduce_mean(tf.abs(adj))

        grad = grad - tf.reshape(adj, [n_envs * n_steps, 1]) * kl_grad
        grads_f = -grad / (n_envs * n_steps)  # These are trust region adjusted gradients wrt f, i.e. the statistics of policy pi
        grads_policy = tf.gradients(distribution_f, params, grads_f)
        grads_q = tf.gradients(loss_q * q_coef, params)
        grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]

        avg_norm_grads_f = avg_norm(grads_f) * (n_steps * n_envs)
        norm_grads_q = tf.global_norm(grads_q)
        norm_grads_policy = tf.global_norm(grads_policy)
    else:
        grads = tf.gradients(loss, params)

    if max_grad_norm is not None:
        grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=learning_rate_ph, decay=rprop_alpha, epsilon=rprop_epsilon)
    _opt_op = trainer.apply_gradients(grads)

    # so when you call _train, you first do the gradient step, then you apply ema
    with tf.control_dependencies([_opt_op]):
        _train = tf.group(ema_apply_op)

    learning_rate = Scheduler(initial_value=learning_rate, n_values=total_timesteps, schedule=lr_schedule)

    # Ops/Summaries to run, and their names for logging
    run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, explained_variance, norm_grads]
    names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads']
    if trust_region:
        run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
        names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']

    def train(obs, actions, rewards, dones, mus, states, masks, steps):
        cur_lr = learning_rate.value_steps(steps)
        td_map = {train_model.obs_ph: obs, polyak_model.obs_ph: obs, action_ph: actions, reward_ph: rewards,
                  done_ph: dones, mu_ph: mus, learning_rate_ph: cur_lr}
        if len(states) != 0:
            td_map[train_model.states_ph] = states
            td_map[train_model.masks_ph] = masks
            td_map[polyak_model.states_ph] = states
            td_map[polyak_model.masks_ph] = masks
        return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train

    def save(save_path):
        session_params = sess.run(params)
        make_path(os.path.dirname(save_path))
        joblib.dump(session_params, save_path)

    self.train = train
    self.save = save
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
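# --- Illustrative sketch (not part of the original file) ------------------------------
# cat_entropy_softmax() / calc_entropy_softmax() and q_explained_variance() are imported
# helpers; a sketch of the behaviour assumed above, following the baselines a2c/acer
# utilities (an assumption, not the authoritative code).
def cat_entropy_softmax(p0):
    # Entropy of an already-softmaxed categorical distribution, summed over actions.
    return -tf.reduce_sum(p0 * tf.log(p0 + 1e-6), axis=1)

def q_explained_variance(qpred, q):
    # 1 - Var[q - qpred] / Var[q], computed over the [nenvs, nsteps] batch.
    _, vary = tf.nn.moments(q, axes=[0, 1])
    _, varpred = tf.nn.moments(q - qpred, axes=[0, 1])
    return 1.0 - (varpred / vary)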