def __init__(self, policy, env, nsteps,
             ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
             alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):

    sess = tf_util.get_session()
    nenvs = env.num_envs
    nbatch = nenvs * nsteps

    with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
        step_model = policy(nenvs, 1, sess)
        train_model = policy(nbatch, nsteps, sess)

    A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)
    entropy = tf.reduce_mean(train_model.pd.entropy())
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("a2c_model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))

    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()

        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = functools.partial(tf_util.save_variables, sess=sess)
    self.load = functools.partial(tf_util.load_variables, sess=sess)
    tf.global_variables_initializer().run(session=sess)
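# ---------------------------------------------------------------------------
# Note: the Scheduler used above comes from baselines.a2c.utils and is not
# defined in this file.  A minimal sketch of a compatible scheduler, assuming
# only the 'linear' and 'constant' schedules used in this repo, could look
# like the class below (the real implementation supports more schedules):
class _SchedulerSketch(object):
    """Anneals a scalar (e.g. the learning rate) over nvalues calls to value()."""

    def __init__(self, v, nvalues, schedule):
        self.n = 0.0
        self.v = v
        self.nvalues = nvalues
        self.schedule = {'linear': lambda p: 1.0 - p,
                         'constant': lambda p: 1.0}[schedule]

    def value(self):
        current_value = self.v * self.schedule(self.n / self.nvalues)
        self.n += 1.0
        return current_value

    def value_steps(self, steps):
        # query the schedule at an absolute step count without advancing it
        return self.v * self.schedule(steps / self.nvalues)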
def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef,
             gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps,
             lrschedule, c, trust_region, alpha, delta, scope, load_path, debug,
             policy_inputs):
    self.sess = sess
    self.nenv = nenvs
    self.policy_inputs = policy_inputs.copy()
    nact = ac_space.n
    nbatch = nenvs * nsteps
    eps = 1e-6
    self.scope = scope

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        self.A = tf.placeholder(tf.int32, [nbatch], name="action")        # actions
        self.D = tf.placeholder(tf.float32, [nbatch], name="dones")       # dones
        self.R = tf.placeholder(tf.float32, [nbatch], name="rewards")     # rewards, not returns
        self.MU = tf.placeholder(tf.float32, [nbatch, nact], name="mus")  # mu's
        self.LR = tf.placeholder(tf.float32, [], name="lr")
        self.V_NEXT = tf.placeholder(tf.float32, [nbatch], name="value_next")  # (by lzn: we revise goal-conditioned next value)

        if isinstance(ob_space, gym.spaces.Dict):
            self.obs_shape = ob_space.spaces['observation'].shape
            self.obs_dtype = ob_space.spaces['observation'].dtype
        else:
            self.obs_shape = ob_space.shape
            self.obs_dtype = ob_space.dtype
        self.achieved_goal_sh = achieved_goal_sh = ACHIEVED_GOAL_SHAPE
        self.desired_goal_sh = desired_goal_sh = DESIRED_GOAL_SHAPE
        self.desired_goal_state_sh = desired_goal_state_sh = self.obs_shape

        self.step_obs_tf = tf.placeholder(self.obs_dtype, (nenvs,) + self.obs_shape, 'step_obs')
        self.step_achieved_goal_tf = tf.placeholder(tf.float32, (nenvs,) + achieved_goal_sh, 'step_achieved_goal')
        self.step_desired_goal_tf = tf.placeholder(tf.float32, (nenvs,) + desired_goal_sh, 'step_desired_goal')
        self.step_desired_goal_state_tf = tf.placeholder(self.obs_dtype, (nenvs,) + desired_goal_state_sh, 'step_desired_goal_state')
        self.train_obs_tf = tf.placeholder(self.obs_dtype, (nenvs * nsteps,) + self.obs_shape, 'train_obs')
        self.train_achieved_goal_tf = tf.placeholder(tf.float32, (nenvs * nsteps,) + achieved_goal_sh, 'train_achieved_goal')
        self.train_desired_goal_tf = tf.placeholder(tf.float32, (nenvs * nsteps,) + desired_goal_sh, 'train_desired_goal')
        self.train_desired_goal_state_tf = tf.placeholder(self.obs_dtype, (nenvs * nsteps,) + desired_goal_state_sh, 'train_desired_goal_state')

        # normalize embedding
        normalizer = 2500
        step_achieved_goal_tf = self.step_achieved_goal_tf / normalizer
        step_desired_goal_tf = self.step_desired_goal_tf / normalizer
        train_achieved_goal_tf = self.train_achieved_goal_tf / normalizer
        train_desired_goal_tf = self.train_desired_goal_tf / normalizer
        step_obs_tf = self.step_obs_tf
        step_desired_goal_state_tf = self.step_desired_goal_state_tf
        train_obs_tf = self.train_obs_tf
        train_desired_goal_state_tf = self.train_desired_goal_state_tf

        assert 'obs' in policy_inputs
        logger.info('policy_inputs:{}'.format(policy_inputs))
        logger.info('achieved_goal_sh:{}'.format(self.achieved_goal_sh))
        logger.info('desired_goal_sh:{}'.format(self.desired_goal_sh))
        logger.info('normalizer:{}'.format(normalizer))
        policy_inputs.remove('obs')

        if 'desired_goal_state' in policy_inputs:
            policy_inputs.remove('desired_goal_state')
            step_state_tf = tf.concat([step_obs_tf, step_desired_goal_state_tf], axis=-1, name='step_state')
            train_state_tf = tf.concat([train_obs_tf, train_desired_goal_state_tf], axis=-1, name='train_state')
        else:
            step_state_tf = step_obs_tf
            train_state_tf = train_obs_tf

        if 'achieved_goal' in policy_inputs and 'desired_goal' not in policy_inputs:
            policy_inputs.remove('achieved_goal')
            step_goal_tf = step_achieved_goal_tf
            train_goal_tf = train_achieved_goal_tf
        elif 'achieved_goal' not in policy_inputs and 'desired_goal' in policy_inputs:
            policy_inputs.remove('desired_goal')
            step_goal_tf = step_desired_goal_tf
            train_goal_tf = train_desired_goal_tf
        elif 'achieved_goal' in policy_inputs and 'desired_goal' in policy_inputs:
            policy_inputs.remove('achieved_goal')
            policy_inputs.remove('desired_goal')
            step_goal_tf = tf.concat([step_achieved_goal_tf, step_desired_goal_tf], axis=-1, name='step_goal')
            train_goal_tf = tf.concat([train_achieved_goal_tf, train_desired_goal_tf], axis=-1, name='train_goal')
        else:
            step_goal_tf, train_goal_tf = None, None
        if len(policy_inputs) > 0:
            raise ValueError("Unused policy inputs:{}".format(policy_inputs))

        self.step_model = policy(nbatch=nenvs, nsteps=1, state_placeholder=step_state_tf,
                                 sess=self.sess, goal_placeholder=step_goal_tf)
        self.train_model = policy(nbatch=nbatch, nsteps=nsteps, state_placeholder=train_state_tf,
                                  sess=self.sess, goal_placeholder=train_goal_tf,
                                  summary_stats=True)

    variables = find_trainable_variables
    self.params = params = variables(scope)
    logger.info("========================== {} =============================".format(scope))
    for var in params:
        logger.info(var)
    logger.info("========================== {} =============================\n".format(scope))

    # create polyak averaged model
    ema = tf.train.ExponentialMovingAverage(alpha)
    ema_apply_op = ema.apply(params)

    # print("========================== Ema =============================")
    def custom_getter(getter, *args, **kwargs):
        v = ema.average(getter(*args, **kwargs))
        # print(v.name)
        return v
    # print("========================== Ema =============================")

    with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True):
        self.polyak_model = policy(nbatch=nbatch, nsteps=nsteps, state_placeholder=train_state_tf,
                                   goal_placeholder=train_goal_tf, sess=self.sess)

    # Notation: (var) = batch variable, (var)s = sequence variable,
    # (var)_i = variable indexed by the action taken at step i.
    # Action probability distributions according to self.train_model, self.polyak_model
    # and self.step_model; policy.pi holds the distribution parameters (logits), so a
    # softmax is needed to obtain a distribution that sums to 1.
    train_model_p = tf.nn.softmax(self.train_model.pi)
    polyak_model_p = tf.nn.softmax(self.polyak_model.pi)
    self.step_model_p = tf.nn.softmax(self.step_model.pi)

    # (todo by lizn, use this to calculate next value)
    v = self.v = tf.reduce_sum(train_model_p * self.train_model.q, axis=-1)  # shape is [nenvs * nsteps]

    # strip off last step
    # (todo by lizn, we don't need strip)
    f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                      [train_model_p, polyak_model_p, self.train_model.q])
    # f, f_pol, q = map(lambda x: x, [train_model_p, polyak_model_p, self.train_model.q])

    # Get pi and q values for actions taken
    f_i = get_by_index(f, self.A)
    q_i = get_by_index(q, self.A)

    # Compute ratios for importance truncation
    rho = f / (self.MU + eps)
    rho_i = get_by_index(rho, self.A)

    # Calculate Q_retrace targets
    # (todo by lizn, use new next state value)
    qret = q_retrace(self.R, self.D, q_i, self.V_NEXT, rho_i, nenvs, nsteps, gamma)

    # Calculate losses
    # Entropy
    # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps))
    entropy = tf.reduce_mean(cat_entropy_softmax(f))

    # Policy gradient loss, with truncated importance sampling & bias correction
    v = strip(v, nenvs, nsteps, True)  # (todo by lzn: we do not need to strip the last one)
    check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
    check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

    # Truncated importance sampling
    adv = qret - v
    logf = tf.log(f_i + eps)
    gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
    loss_f = -tf.reduce_mean(gain_f)

    # Bias correction for the truncation
    adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
    logf_bc = tf.log(f + eps)  # / (f_old + eps)
    check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
    gain_bc = tf.reduce_sum(
        logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
        axis=1)  # IMP: this is a sum, as an expectation wrt f
    loss_bc = -tf.reduce_mean(gain_bc)
    loss_policy = loss_f + loss_bc

    # Value/Q function loss, and explained variance
    check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
    ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                              tf.reshape(qret, [nenvs, nsteps]))
    loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

    # Net loss
    check_shape([loss_policy, loss_q, entropy], [[]] * 3)
    # Goal loss
    loss = loss_policy + q_coef * loss_q - ent_coef * entropy

    if trust_region:
        g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f)  # [nenvs * nsteps, nact]
        # k = tf.gradients(KL(f_pol || f), f)
        k = -f_pol / (f + eps)  # [nenvs * nsteps, nact]; directly computed gradient of KL divergence wrt f
        k_dot_g = tf.reduce_sum(k * g, axis=-1)
        adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                         (tf.reduce_sum(tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

        # Calculate stats (before doing adjustment) for logging.
        avg_norm_k = avg_norm(k)
        avg_norm_g = avg_norm(g)
        avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
        avg_norm_adj = tf.reduce_mean(tf.abs(adj))

        g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
        # These are trust-region adjusted gradients wrt f, i.e. the statistics of policy pi
        grads_f = -g / (nenvs * nsteps)
        grads_policy = tf.gradients(f, params, grads_f)
        grads_q = tf.gradients(loss_q * q_coef, params)
        # print("=========================== grads add ==============================")
        grads = [gradient_add(g1, g2, param)
                 for (g1, g2, param) in zip(grads_policy, grads_q, params)]
        # print("=========================== grads add ==============================\n")

        avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
        norm_grads_q = tf.global_norm(grads_q)
        norm_grads_policy = tf.global_norm(grads_policy)
    else:
        grads = tf.gradients(loss, params)

    if max_grad_norm is not None:
        grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))

    trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR, decay=rprop_alpha,
                                        epsilon=rprop_epsilon)
    _policy_opt_op = trainer.apply_gradients(grads)

    # so when you call _train, you first do the gradient step, then you apply ema
    with tf.control_dependencies([_policy_opt_op]):
        _train_policy = tf.group(ema_apply_op)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    # Ops/Summaries to run, and their names for logging
    self.run_ops_policy = [_train_policy, loss, loss_q, entropy, loss_policy,
                           loss_f, loss_bc, ev, norm_grads]
    self.names_ops_policy = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f',
                             'loss_bc', 'explained_variance', 'norm_grads']
    if trust_region:
        self.run_ops_policy = self.run_ops_policy + [
            norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
            avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
        self.names_ops_policy = self.names_ops_policy + [
            'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k',
            'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']
    self.names_ops_policy = [scope + "_" + x for x in self.names_ops_policy]  # scope as prefix

    self.save = functools.partial(save_variables, sess=self.sess, variables=params)
    self.initial_state = self.step_model.initial_state

    # with tf.variable_scope('stats'):
    #     with tf.variable_scope('achieved_goal'):
    #         self.ag_stats = Normalizer(size=self.achieved_goal_sh[0], sess=self.sess)
    #     with tf.variable_scope('desired_goal'):
    #         self.g_stats = Normalizer(size=self.desired_goal_sh[0], sess=self.sess)

    if debug:
        tf.global_variables_initializer().run(session=self.sess)
        load_variables(load_path, self.params, self.sess)
    else:
        tf.global_variables_initializer().run(session=self.sess)
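# ---------------------------------------------------------------------------
# The Q_retrace targets above are produced by the q_retrace helper, which is
# defined elsewhere in this repo.  As a reference, the standard ACER
# Retrace(lambda=1) recursion can be sketched in numpy as below.  Note that
# this repo's variant bootstraps from the externally fed V_NEXT placeholder
# rather than from the value of an extra (nsteps+1)-th observation, so treat
# this only as a sketch of the underlying recursion, not the exact code used.
import numpy as np

def q_retrace_reference(rewards, dones, q_i, values, rho_i, gamma):
    """rewards, dones, q_i, rho_i: [nenvs, nsteps]; values: [nenvs, nsteps + 1]."""
    nenvs, nsteps = rewards.shape
    rho_bar = np.minimum(1.0, rho_i)      # truncated importance weights
    qret = values[:, -1]                  # bootstrap from the final state value
    out = np.zeros_like(rewards)
    for i in reversed(range(nsteps)):
        qret = rewards[:, i] + gamma * qret * (1.0 - dones[:, i])
        out[:, i] = qret
        # carry the off-policy correction back to the previous step
        qret = rho_bar[:, i] * (qret - q_i[:, i]) + values[:, i]
    return out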
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
             ent_coef=0.01, v_mix_coef=0.5, max_grad_norm=0.5,
             lr_alpha=7e-4, lr_beta=7e-4, alpha=0.99, epsilon=1e-5,
             total_timesteps=int(80e6), lrschedule='linear',
             r_ex_coef=1.0, r_in_coef=0.0, v_ex_coef=1.0):

    sess = tf_util.make_session()
    nact = ac_space.n
    nbatch = nenvs * nsteps

    A = tf.placeholder(tf.int32, [nbatch], 'A')
    R_EX = tf.placeholder(tf.float32, [nbatch], 'R_EX')
    ADV_EX = tf.placeholder(tf.float32, [nbatch], 'ADV_EX')
    RET_EX = tf.placeholder(tf.float32, [nbatch], 'RET_EX')
    V_MIX = tf.placeholder(tf.float32, [nbatch], 'V_MIX')
    DIS_V_MIX_LAST = tf.placeholder(tf.float32, [nbatch], 'DIS_V_MIX_LAST')
    COEF_MAT = tf.placeholder(tf.float32, [nbatch, nbatch], 'COEF_MAT')
    LR_ALPHA = tf.placeholder(tf.float32, [], 'LR_ALPHA')
    LR_BETA = tf.placeholder(tf.float32, [], 'LR_BETA')

    step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True)

    # Mix extrinsic and learned intrinsic rewards, then roll them into returns.
    r_mix = r_ex_coef * R_EX + r_in_coef * tf.reduce_sum(
        train_model.r_in * tf.one_hot(A, nact), axis=1)
    ret_mix = tf.squeeze(
        tf.matmul(COEF_MAT, tf.reshape(r_mix, [nbatch, 1])), [1]) + DIS_V_MIX_LAST
    adv_mix = ret_mix - V_MIX

    neglogpac = train_model.pd.neglogp(A)
    pg_mix_loss = tf.reduce_mean(adv_mix * neglogpac)
    v_mix_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_mix), ret_mix))
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    policy_loss = pg_mix_loss - ent_coef * entropy + v_mix_coef * v_mix_loss

    policy_params = train_model.policy_params
    policy_grads = tf.gradients(policy_loss, policy_params)
    if max_grad_norm is not None:
        policy_grads, policy_grad_norm = tf.clip_by_global_norm(policy_grads, max_grad_norm)
    policy_grads_and_vars = list(zip(policy_grads, policy_params))
    policy_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_ALPHA, decay=alpha, epsilon=epsilon)
    policy_train = policy_trainer.apply_gradients(policy_grads_and_vars)

    # Build a differentiable "one RMSProp step ahead" copy of the policy parameters,
    # so the intrinsic-reward parameters can be trained through the policy update.
    rmss = [policy_trainer.get_slot(var, 'rms') for var in policy_params]
    policy_params_new = {}
    for grad, rms, var in zip(policy_grads, rmss, policy_params):
        ms = rms + (tf.square(grad) - rms) * (1 - alpha)
        policy_params_new[var.name] = var - LR_ALPHA * grad / tf.sqrt(ms + epsilon)
    policy_new = train_model.policy_new_fn(policy_params_new, ob_space, ac_space, nbatch, nsteps)

    neglogpac_new = policy_new.pd.neglogp(A)
    ratio_new = tf.exp(tf.stop_gradient(neglogpac) - neglogpac_new)
    pg_ex_loss = tf.reduce_mean(-ADV_EX * ratio_new)
    v_ex_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_ex), RET_EX))
    intrinsic_loss = pg_ex_loss + v_ex_coef * v_ex_loss

    intrinsic_params = train_model.intrinsic_params
    intrinsic_grads = tf.gradients(intrinsic_loss, intrinsic_params)
    if max_grad_norm is not None:
        intrinsic_grads, intrinsic_grad_norm = tf.clip_by_global_norm(intrinsic_grads, max_grad_norm)
    intrinsic_grads_and_vars = list(zip(intrinsic_grads, intrinsic_params))
    intrinsic_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_BETA, decay=alpha, epsilon=epsilon)
    intrinsic_train = intrinsic_trainer.apply_gradients(intrinsic_grads_and_vars)

    lr_alpha = Scheduler(v=lr_alpha, nvalues=total_timesteps, schedule=lrschedule)
    lr_beta = Scheduler(v=lr_beta, nvalues=total_timesteps, schedule=lrschedule)

    all_params = tf.global_variables()

    def train(obs, policy_states, masks, actions, r_ex, ret_ex, v_ex,
              v_mix, dis_v_mix_last, coef_mat):
        advs_ex = ret_ex - v_ex
        for step in range(len(obs)):
            cur_lr_alpha = lr_alpha.value()
            cur_lr_beta = lr_beta.value()

        td_map = {
            train_model.X: obs, policy_new.X: obs, A: actions,
            R_EX: r_ex, ADV_EX: advs_ex, RET_EX: ret_ex,
            V_MIX: v_mix, DIS_V_MIX_LAST: dis_v_mix_last, COEF_MAT: coef_mat,
            LR_ALPHA: cur_lr_alpha, LR_BETA: cur_lr_beta
        }
        if policy_states is not None:
            td_map[train_model.PS] = policy_states
            td_map[train_model.M] = masks
        return sess.run([entropy, policy_train, intrinsic_train], td_map)[0]

    def save(save_path):
        ps = sess.run(all_params)
        make_path(osp.dirname(save_path))
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(all_params, loaded_params):
            restores.append(p.assign(loaded_p))
        ps = sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.intrinsic_reward = step_model.intrinsic_reward
    self.init_policy_state = step_model.init_policy_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
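# ---------------------------------------------------------------------------
# COEF_MAT and DIS_V_MIX_LAST are fed by the runner (not shown in this file).
# The matmul above turns per-step mixed rewards into discounted returns:
#     ret_mix[i] = sum_j COEF_MAT[i, j] * r_mix[j] + DIS_V_MIX_LAST[i].
# The sketch below shows one way such inputs could be built for a rollout laid
# out env-major ([nenvs * nsteps]); the actual runner in this repo may differ.
import numpy as np

def build_return_coefs(dones, v_last, gamma, nenvs, nsteps):
    """dones: [nenvs, nsteps] terminal flags; v_last: [nenvs] bootstrap values."""
    nbatch = nenvs * nsteps
    coef_mat = np.zeros((nbatch, nbatch), dtype=np.float32)
    dis_v_last = np.zeros(nbatch, dtype=np.float32)
    for env in range(nenvs):
        for t in range(nsteps):
            i = env * nsteps + t
            coef = 1.0
            for k in range(t, nsteps):
                coef_mat[i, env * nsteps + k] = coef
                if dones[env, k]:      # stop accumulating at an episode boundary
                    coef = 0.0
                    break
                coef *= gamma
            dis_v_last[i] = coef * v_last[env]  # bootstrap only if not terminated
    return coef_mat, dis_v_last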
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32,
             nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25,
             max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'):

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=nprocs,
                            inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
    self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True)

    logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
    self.logits = logits = train_model.pi

    ## training loss
    pg_loss = tf.reduce_mean(ADV * logpac)
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    pg_loss = pg_loss - ent_coef * entropy
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    train_loss = pg_loss + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
    sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
    self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
        tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
    self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

    self.params = params = find_trainable_variables("model")
    self.grads_check = grads = tf.gradients(train_loss, params)

    with tf.device('/gpu:0'):
        # NOTE: `async` is a reserved keyword from Python 3.7 on; newer versions of
        # this optimizer expose the same flag as `is_async` (see the variant below).
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10,
            max_grad_norm=max_grad_norm)

        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
        train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))
    self.q_runner = q_runner
    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = self.lr.value()

        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, PG_LR: cur_lr}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks

        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, train_op], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        ps = sess.run(params)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
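# ---------------------------------------------------------------------------
# cat_entropy and mse above come from baselines.a2c.utils and are not defined
# in this file.  Minimal versions consistent with how they are used here
# (a sketch against the TF1 API):
def cat_entropy_sketch(logits):
    # entropy of the softmax distribution, computed in a numerically stable way
    a0 = logits - tf.reduce_max(logits, axis=1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, axis=1, keepdims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=1)

def mse_sketch(pred, target):
    # elementwise squared error; callers apply their own reduce_mean / reduce_sum
    return tf.square(pred - target) / 2.0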
def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta, scope, goal_shape): self.sess = sess self.nenv = nenvs self.goal_shape = goal_shape nact = ac_space.n nbatch = nenvs * nsteps eps = 1e-6 self.scope = scope with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): self.A = tf.placeholder(tf.int32, [nbatch], name="action") # actions self.D = tf.placeholder(tf.float32, [nbatch], name="dones") # dones self.R = tf.placeholder(tf.float32, [nbatch], name="rewards") # rewards, not returns self.MU = tf.placeholder(tf.float32, [nbatch, nact], name="mus") # mu's self.LR = tf.placeholder(tf.float32, [], name="lr") step_ob_placeholder = tf.placeholder(ob_space.dtype, (nenvs, ) + ob_space.shape, "step_ob") step_goal_placeholder = tf.placeholder(tf.float32, (nenvs, ) + goal_shape, "step_goal") step_goal_encoded = step_goal_placeholder train_ob_placeholder = tf.placeholder( ob_space.dtype, (nenvs * (nsteps + 1), ) + ob_space.shape, "train_ob") train_goal_placeholder = tf.placeholder( tf.float32, (nenvs * (nsteps + 1), ) + goal_shape, "train_goal") train_goal_encoded = train_goal_placeholder concat_on_latent = False self.step_model = policy(nbatch=nenvs, nsteps=1, observ_placeholder=step_ob_placeholder, sess=self.sess, goal_placeholder=step_goal_placeholder, concat_on_latent=concat_on_latent, goal_encoded=step_goal_encoded) self.train_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=self.sess, goal_placeholder=train_goal_placeholder, concat_on_latent=concat_on_latent, goal_encoded=train_goal_encoded) variables = find_trainable_variables self.params = params = variables(scope) logger.info( "========================== {} =============================". format(scope)) for var in params: logger.info(var) logger.info( "========================== {} =============================\n". 
format(scope)) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) # print("========================== Ema =============================") def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) # print(v.name) return v # print("========================== Ema =============================") with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True): self.polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, goal_placeholder=train_goal_placeholder, sess=self.sess, concat_on_latent=concat_on_latent, goal_encoded=train_goal_encoded) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i # action probability distributions according to self.train_model, self.polyak_model and self.step_model # poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax train_model_p = tf.nn.softmax(self.train_model.pi) polyak_model_p = tf.nn.softmax(self.polyak_model.pi) self.step_model_p = tf.nn.softmax(self.step_model.pi) self.v = v = tf.reduce_sum(train_model_p * self.train_model.q, axis=-1) # shape is [nenvs * (nsteps + 1)] # strip off last step f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, self.train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, self.A) q_i = get_by_index(q, self.A) # Compute ratios for importance truncation rho = f / (self.MU + eps) rho_i = get_by_index(rho, self.A) # Calculate Q_retrace targets self.qret = qret = q_retrace(self.R, self.D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling adv = qret - v logf = tf.log(f_i + eps) gain_f = logf * tf.stop_gradient( adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]) ) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2) gain_bc = tf.reduce_sum( logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1) # IMP: This is sum, as expectation wrt f loss_bc = -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]] * 2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) # Goal loss loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f) # [nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = -f_pol / ( f + eps ) # [nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) # [nenvs * nsteps] # Calculate 
stats (before doing adjustment) for logging. avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g / ( nenvs * nsteps ) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) # print("=========================== gards add ==============================") grads = [ gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params) ] # print("=========================== gards add ==============================\n") avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR, decay=rprop_alpha, epsilon=rprop_epsilon) _policy_opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_policy_opt_op]): _train_policy = tf.group(ema_apply_op) self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging self.run_ops_policy = [ _train_policy, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads ] self.names_ops_policy = [ 'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if trust_region: self.run_ops_policy = self.run_ops_policy + [ norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] self.names_ops_policy = self.names_ops_policy + [ 'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] self.names_ops_policy = [ scope + "_" + x for x in self.names_ops_policy ] # scope as prefix self.save = functools.partial(save_variables, sess=self.sess, variables=params) self.initial_state = self.step_model.initial_state tf.global_variables_initializer().run(session=self.sess)
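# ---------------------------------------------------------------------------
# The ACER losses above lean on two small tensor helpers from baselines' acer
# code, strip and get_by_index, which are not defined in this file.  Equivalent
# sketches, written with plain reshapes instead of batch_to_seq/seq_to_batch:
def strip_sketch(var, nenvs, nsteps):
    """Drop the last time step of a [nenvs * (nsteps + 1), ...] batch."""
    tail = var.get_shape().as_list()[1:]
    seq = tf.reshape(var, [nenvs, nsteps + 1] + tail)[:, :-1]
    return tf.reshape(seq, [nenvs * nsteps] + tail)

def get_by_index_sketch(x, idx):
    """Pick x[i, idx[i]] for every row i (the entry for the action taken)."""
    nbatch = tf.shape(x)[0]
    flat_idx = tf.range(nbatch) * tf.shape(x)[1] + idx
    return tf.gather(tf.reshape(x, [-1]), flat_idx)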
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', logdir=None): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs*nsteps ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=train_model.a0) entropy = tf.reduce_sum(cat_entropy(train_model.pi)) params = find_trainable_variables("model") tf.summary.histogram("vf", train_model.vf) tf.summary.histogram("R", R) if train_model.relaxed: pg_loss = tf.constant(0.0) oh_A = tf.one_hot(train_model.a0, ac_space.n) params = find_trainable_variables("model") policy_params = [v for v in params if "pi" in v.name] vf_params = [v for v in params if "vf" in v.name] entropy_grads = tf.gradients(entropy, policy_params) ddiff_loss = tf.reduce_sum(train_model.vf - train_model.vf_t) ddiff_grads = tf.gradients(ddiff_loss, policy_params) sm = tf.nn.softmax(train_model.pi) dlogp_dpi = oh_A * (1. - sm) + (1. - oh_A) * (-sm) pi_grads = -((tf.expand_dims(R, 1) - train_model.vf_t) * dlogp_dpi) pg_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads) pg_grads = [pg - dg for pg, dg in zip(pg_grads, ddiff_grads)] pi_param_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads) cv_grads = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0) cv_grad_splits = tf.reduce_sum(tf.square(cv_grads)) vf_loss = cv_grad_splits * vf_coef cv_grads = tf.gradients(vf_loss, vf_params) policy_grads = [] for e_grad, p_grad, param in zip(entropy_grads, pg_grads, policy_params): grad = -e_grad * ent_coef + p_grad policy_grads.append(grad) grad_dict = {} for g, v in list(zip(policy_grads, policy_params))+list(zip(cv_grads, vf_params)): grad_dict[v] = g grads = [grad_dict[v] for v in params] print(grads) else: pg_loss = tf.reduce_sum((tf.stop_gradient(R) - tf.stop_gradient(train_model.vf)) * neglogpac) policy_params = [v for v in params if "pi" in v.name] pg_grads = tf.gradients(pg_loss, policy_params) vf_loss = tf.reduce_sum(mse(tf.squeeze(train_model.vf), R)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef grads = tf.gradients(loss, params) grads = list(zip(grads, params)) ema = tf.train.ExponentialMovingAverage(.99) all_policy_grads = tf.concat([tf.reshape(g, [-1]) for g in pg_grads], 0) all_policy_grads_sq = tf.square(all_policy_grads) apply_mean_op = ema.apply([all_policy_grads, all_policy_grads_sq]) em_mean = ema.average(all_policy_grads) em_mean_sq = ema.average(all_policy_grads_sq) em_var = em_mean_sq - tf.square(em_mean) em_log_var = tf.log(em_var + 1e-20) mlgv = tf.reduce_mean(em_log_var) for g, v in grads: print(v.name, g) tf.summary.histogram(v.name, v) tf.summary.histogram(v.name+"_grad", g) self.sum_op = tf.summary.merge_all() self.writer = tf.summary.FileWriter(logdir) trainer = tf.train.AdamOptimizer(learning_rate=LR, beta2=.99999) with tf.control_dependencies([apply_mean_op]): _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) self._step = 0 
def train(obs, states, rewards, masks, u1, u2, values, summary=False): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X:obs, train_model.U1:u1, train_model.U2:u2, ADV:advs, R:rewards, LR:cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks if summary: sum_str, policy_loss, value_loss, policy_entropy, lv, _ = sess.run( [self.sum_op, pg_loss, vf_loss, entropy, mlgv, _train], td_map ) self.writer.add_summary(sum_str, self._step) else: policy_loss, value_loss, policy_entropy, lv, _ = sess.run( [pg_loss, vf_loss, entropy, mlgv, _train], td_map ) self._step += 1 return policy_loss, value_loss, policy_entropy, lv def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
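# ---------------------------------------------------------------------------
# Hypothetical usage of the train() function defined above inside a learning
# loop (runner, log_interval and total_updates are illustrative names, not part
# of this file): summaries are only fetched and written every log_interval
# updates, keeping the TensorBoard writer out of the hot path.
def _example_training_loop(model, runner, total_updates, log_interval=100):
    for update in range(1, total_updates + 1):
        obs, states, rewards, masks, u1, u2, values = runner.run()
        want_summary = (update % log_interval == 0)
        policy_loss, value_loss, policy_entropy, log_var = model.train(
            obs, states, rewards, masks, u1, u2, values, summary=want_summary)
        if want_summary:
            model.writer.flush()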
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32,
             nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25,
             max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear', is_async=True):

    self.sess = sess = get_session()
    nbatch = nenvs * nsteps
    with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
        self.model = step_model = policy(nenvs, 1, sess=sess)
        self.model2 = train_model = policy(nenvs * nsteps, nsteps, sess=sess)

    A = train_model.pdtype.sample_placeholder([None])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)
    self.logits = train_model.pi

    ## training loss
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    entropy = tf.reduce_mean(train_model.pd.entropy())
    pg_loss = pg_loss - ent_coef * entropy
    vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
    train_loss = pg_loss + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
    sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
    self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
        tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
    self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

    self.params = params = find_trainable_variables("acktr_model")
    self.grads_check = grads = tf.gradients(train_loss, params)

    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, is_async=is_async, cold_iter=10,
            max_grad_norm=max_grad_norm)

        # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
        optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
        train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))
    self.q_runner = q_runner
    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = self.lr.value()

        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards,
                  PG_LR: cur_lr, VF_LR: cur_lr}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks

        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, train_op], td_map)
        return policy_loss, value_loss, policy_entropy

    self.train = train
    self.save = functools.partial(save_variables, sess=sess)
    self.load = functools.partial(load_variables, sess=sess)
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
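# ---------------------------------------------------------------------------
# When is_async is True, the KFAC optimizer returns a queue runner that must be
# started before training so the Fisher statistics are updated in background
# threads.  This mirrors how baselines' ACKTR learner drives it; treat the
# snippet as a sketch rather than the exact driver used in this repo.
def _start_kfac_queue(model):
    # Remember to call coord.request_stop() and coord.join(enqueue_threads)
    # once training finishes.
    coord = tf.train.Coordinator()
    enqueue_threads = (model.q_runner.create_threads(model.sess, coord=coord, start=True)
                       if model.q_runner is not None else [])
    return coord, enqueue_threads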
class Model(object): def __init__(self, sess, policy, dynamics, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta, scope, goal_shape, residual): self.sess = sess self.nenv = nenvs self.residual = residual self.goal_shape = goal_shape self.goal_as_image = goal_as_image = len(goal_shape) == 3 if self.goal_as_image: assert self.goal_shape == ob_space.shape else: logger.info("normalize goal using RunningMeanStd") with tf.variable_scope("RunningMeanStd", reuse=tf.AUTO_REUSE): self.goal_rms = RunningMeanStd(epsilon=1e-4, shape=self.goal_shape) nact = ac_space.n nbatch = nenvs * nsteps eps = 1e-6 self.dynamics = dynamics self.scope = scope with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): self.A = tf.placeholder(tf.int32, [nbatch], name="action") # actions self.D = tf.placeholder(tf.float32, [nbatch], name="dones") # dones self.R = tf.placeholder(tf.float32, [nbatch], name="rewards") # rewards, not returns self.MU = tf.placeholder(tf.float32, [nbatch, nact], name="mus") # mu's self.LR = tf.placeholder(tf.float32, [], name="lr") self.V_NEXT = tf.placeholder(tf.float32, [nbatch], name="v_next") step_ob_placeholder = tf.placeholder(ob_space.dtype, (nenvs, ) + ob_space.shape, "step_ob") if self.dynamics.dummy: step_goal_placeholder, concat_on_latent, step_goal_encoded = None, None, None else: if goal_as_image: step_goal_placeholder = tf.placeholder( ob_space.dtype, (nenvs, ) + ob_space.shape, "step_goal") concat_on_latent, train_goal_encoded, step_goal_encoded = False, None, None else: step_goal_placeholder = tf.placeholder( tf.float32, (nenvs, ) + goal_shape, "step_goal") step_goal_encoded = tf.clip_by_value( (step_goal_placeholder - self.goal_rms.mean) / self.goal_rms.std, -5., 5.) train_ob_placeholder = tf.placeholder( ob_space.dtype, (nenvs * nsteps, ) + ob_space.shape, "train_ob") if self.dynamics.dummy: train_goal_placeholder, concat_on_latent, train_goal_encoded = None, None, None else: if goal_as_image: train_goal_placeholder = tf.placeholder( ob_space.dtype, (nenvs * nsteps, ) + ob_space.shape, "train_goal") concat_on_latent, train_goal_encoded = False, None else: train_goal_placeholder = tf.placeholder( tf.float32, (nenvs * nsteps, ) + goal_shape, "train_goal") concat_on_latent = True train_goal_encoded = tf.clip_by_value( (train_goal_placeholder - self.goal_rms.mean) / self.goal_rms.std, -5., 5.) self.step_model = policy(nbatch=nenvs, nsteps=1, observ_placeholder=step_ob_placeholder, sess=self.sess, goal_placeholder=step_goal_placeholder, concat_on_latent=concat_on_latent, goal_encoded=step_goal_encoded) self.train_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=self.sess, goal_placeholder=train_goal_placeholder, concat_on_latent=concat_on_latent, goal_encoded=train_goal_encoded) variables = find_trainable_variables self.params = params = variables(scope) logger.info( "========================== {} =============================". format(scope)) for var in params: logger.info(var) logger.info( "========================== {} =============================\n". format(scope)) logger.info( "======================={}: Aux & Dyna =========================". 
format(scope)) for var in self.dynamics.params: logger.info(var) logger.info( "======================={}: Aux & Dyna =========================\n" .format(scope)) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) # print("========================== Ema =============================") def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) # print(v.name) return v # print("========================== Ema =============================") with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True): self.polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, goal_placeholder=train_goal_placeholder, sess=self.sess, concat_on_latent=concat_on_latent, goal_encoded=train_goal_encoded) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i # action probability distributions according to self.train_model, self.polyak_model and self.step_model # poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax train_model_p = tf.nn.softmax(self.train_model.pi) polyak_model_p = tf.nn.softmax(self.polyak_model.pi) self.step_model_p = tf.nn.softmax(self.step_model.pi) v = self.v = tf.reduce_sum(train_model_p * self.train_model.q, axis=-1) # shape is [nenvs * (nsteps)] # strip off last step f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, self.train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, self.A) q_i = get_by_index(q, self.A) # Compute ratios for importance truncation rho = f / (self.MU + eps) rho_i = get_by_index(rho, self.A) # Calculate Q_retrace targets qret = q_retrace(self.R, self.D, q_i, self.V_NEXT, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling adv = qret - v logf = tf.log(f_i + eps) gain_f = logf * tf.stop_gradient( adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]) ) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2) gain_bc = tf.reduce_sum( logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1) # IMP: This is sum, as expectation wrt f loss_bc = -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]] * 2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) # Goal loss loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f) # [nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = -f_pol / ( f + eps ) # [nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj 
= tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) # [nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g / ( nenvs * nsteps ) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) # print("=========================== gards add ==============================") grads = [ gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params) ] # print("=========================== gards add ==============================\n") avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR, decay=rprop_alpha, epsilon=rprop_epsilon) _policy_opt_op = trainer.apply_gradients(grads) if not self.dynamics.dummy: _train_dynamics = trainer.minimize(self.dynamics.loss) self.run_ops_dynamics = [ _train_dynamics, self.dynamics.aux_loss, self.dynamics.dyna_loss, ] self.name_ops_dynamics = ["aux_loss", "dyna_loss"] # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_policy_opt_op]): _train_policy = tf.group(ema_apply_op) self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging self.run_ops_policy = [ _train_policy, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads ] self.names_ops_policy = [ 'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if trust_region: self.run_ops_policy = self.run_ops_policy + [ norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] self.names_ops_policy = self.names_ops_policy + [ 'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] self.names_ops_policy = [ scope + "_" + x for x in self.names_ops_policy ] # scope as prefix self.save = functools.partial(save_variables, sess=self.sess, variables=params) self.initial_state = self.step_model.initial_state tf.global_variables_initializer().run(session=self.sess) def train_policy(self, obs, next_obs, actions, rewards, dones, mus, states, masks, steps, goal_obs, verbose=False): cur_lr = self.lr.value_steps(steps) # 1. calculate v_{t+1} using obs_{t+1} and g_t td_map = {self.train_model.X: next_obs} if not self.dynamics.dummy: assert hasattr(self.train_model, "goals") if self.residual: td_map[self.train_model.goals] = goal_obs - next_obs else: td_map[self.train_model.goals] = goal_obs v_next = self.sess.run(self.v, feed_dict=td_map) # 2. 
use obs_t, goal_t, v_{t+1} to train policy td_map = { self.train_model.X: obs, self.polyak_model.X: obs, self.A: actions, self.R: rewards, self.D: dones, self.MU: mus, self.LR: cur_lr, self.V_NEXT: v_next } if not self.dynamics.dummy: assert hasattr(self.train_model, "goals") assert hasattr(self.polyak_model, "goals") if hasattr(self, "goal_rms"): self.goal_rms.update(goal_obs) if self.residual: td_map[self.train_model.goals] = goal_obs - obs td_map[self.polyak_model.goals] = goal_obs - obs else: td_map[self.train_model.goals] = goal_obs td_map[self.polyak_model.goals] = goal_obs if states is not None: td_map[self.train_model.S] = states td_map[self.train_model.M] = masks td_map[self.polyak_model.S] = states td_map[self.polyak_model.M] = masks if verbose: names_ops_policy = self.names_ops_policy.copy() values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:] # strip off _train else: names_ops_policy = self.names_ops_policy.copy( )[:8] # not including trust region values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:][:8] unimportant_key = ["loss_f", "loss_bc"] for name in names_ops_policy.copy(): for suffix in unimportant_key: if name.endswith(suffix): index = names_ops_policy.index(name) names_ops_policy.pop(index) values_ops_policy.pop(index) break return names_ops_policy, values_ops_policy def train_dynamics(self, obs, actions, next_obs, steps, nb_epoch=1): value_ops_dynamics = [] for epoch in range(nb_epoch): cur_lr = self.lr.value_steps(steps) td_map = { self.dynamics.obs: obs, self.dynamics.next_obs: next_obs, self.dynamics.ac: actions, self.LR: cur_lr } value = self.sess.run(self.run_ops_dynamics, td_map)[1:] value_ops_dynamics.append(value) value_ops_dynamics = np.asarray(value_ops_dynamics) value_ops_dynamics = list(np.mean(value_ops_dynamics, axis=0)) return self.name_ops_dynamics.copy(), value_ops_dynamics def step(self, observation, **kwargs): if self.residual and not self.dynamics.dummy: kwargs["goals"] = kwargs["goals"] - observation return self.step_model.evaluate( [self.step_model.action, self.step_model_p, self.step_model.state], observation, **kwargs)
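# ---------------------------------------------------------------------------
# The trust_region branch in the models above implements the projection from
# the ACER paper (Wang et al., 2017): with g the gradient of the policy loss
# wrt the policy statistics f, and k = -f_polyak / f the gradient of
# KL(f_polyak || f) wrt f, the adjusted gradient is
#     g_adj = g - max(0, (k . g - delta) / ||k||^2) * k,
# i.e. g is left unchanged while the constraint k . g <= delta holds, and is
# otherwise projected back onto the constraint boundary before being
# back-propagated into the parameters via tf.gradients(f, params, -g_adj / nbatch).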
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, use_adda, adda_lr, adda_batch, seed, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear' ): # The epsilion and alpha mentioned here is for RMSProp sess = tf_util.make_session() nbatch = nenvs * nsteps # 16*20 nsteps set in learn() print('nbatch defined and size is ', nbatch) #A = tf.placeholder(tf.int32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) # This is your TD Target LR = tf.placeholder(tf.float32, []) #source_array = np.load('/misc/lmbraid18/raob/source_dataset.npy') # (100000, 84, 84, 1) #target_array = np.load('/misc/lmbraid18/raob/target_dataset.npy') # (100000, 84, 84, 1) print('adda_batch:', adda_batch) step_model = policy( sess, ob_space, ac_space, adda_batch, seed, nbatch=nenvs * 1, nsteps=1, reuse=False, use_adda=use_adda ) # nbatch = nenvs*nsteps, model for generating data, Take 1 step for each env train_model = policy( sess, ob_space, ac_space, adda_batch, seed, nbatch=nenvs * nsteps, nsteps=nsteps, reuse=True, use_adda=use_adda) # model for training using collected data print('Qf:', train_model.Qf.get_shape()) print('R:', R.get_shape()) ########################################################## RL ############################################################### ########### Loss for RL Part ################ loss = tf.reduce_sum(huber_loss( train_model.Qf - R)) # This is your TD Error (Prediction (320,) - TD Target (320,)) ############################################# ########### Optimizer for RL Part ########### params = find_trainable_variables( "model") # Returns a list of variable objects for RL Model grads = tf.gradients( loss, params ) #Calculate gradients of loss wrt params.Returns a list of sum(d_loss/d_param) for each param in params if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_vars = list(zip( grads, params)) # grads_and_vars is a list of (gradient, variable) pairs trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients( grads_and_vars ) # Returns an operation that applies the specified gradients. 
############################################# ##################################################################################################################################### ############################################################ ADDA ############################################################## if use_adda: source_array = np.load('/misc/lmbraid18/raob/source_dataset.npy' ) # (100000, 84, 84, 1) target_array = np.load('/misc/lmbraid18/raob/target_dataset.npy' ) # (100000, 84, 84, 1) print('Size of Datasets: ', len(source_array), len(target_array)) # Initialize Iterators sess.run(train_model.source_iter_op, feed_dict={train_model.dataset_imgs: source_array}) sess.run(train_model.target_iter_op, feed_dict={train_model.dataset_imgs: target_array}) ########### Loss for DA Part ########### mapping_loss = tf.losses.sparse_softmax_cross_entropy( 1 - train_model.adversary_labels, train_model.adversary_logits) adversary_loss = tf.losses.sparse_softmax_cross_entropy( train_model.adversary_labels, train_model.adversary_logits) ############################################# adversary_vars = find_trainable_variables( "adversary" ) # Returns a list of variable objects for Discriminator # extract vars used in target encoder for optimizing in DA part part_vars_names = ('model/c1/b', 'model/c1/w', 'model/c2/b', 'model/c2/w', 'model/c3/b', 'model/c3/w', 'model/fc1/b', 'model/fc1/w') target_vars = [ var for var in params if var.name[:-2] in part_vars_names ] ########### Optimizer for DA Part ########### da_lr_ph = tf.placeholder(tf.float32, []) #lr_var = tf.Variable(adda_lr, name='learning_rate', trainable=False) # Uncomment for constant LR optimizer = tf.train.RMSPropOptimizer( da_lr_ph) # da_lr_ph to lr_var for constant LR mapping_step = optimizer.minimize(mapping_loss, var_list=list(target_vars)) adversary_step = optimizer.minimize(adversary_loss, var_list=list(adversary_vars)) ############################################# print('########################') print(target_vars) print('########################') print('\n') print('########################') print(adversary_vars) print('########################') ##################################################################################################################################### lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Learning Rate Scheduling da_lr = Scheduler(v=adda_lr, nvalues=26e6, schedule=lrschedule) def train(obs, rewards, actions, update): for step in range(len(obs)): # len(obs) = 320 cur_lr = lr.value() ########### Run Session for RL Part ########### td_map = { train_model.X: obs, R: rewards, LR: cur_lr, train_model.A: actions } action_value_loss, _ = sess.run([loss, _train], td_map) ############################################# ########### Run Session for DA Part ########### # run DA losses in a session here. Start with running them after every update step. 
Later, condsider running after every 10 steps #if update > 62500: # If u want DA to run after 20e6 steps (20e6//320 = 62500) if (update > 125000) and (update % 5 == 0): #if update % 5 == 0: # Linearly reduce learning rate over RL batch size for step in range(len(obs)): cur_adda_lr = da_lr.value() # Update adda_lr feed_dict = {da_lr_ph: cur_adda_lr} mapping_loss_val, adversary_loss_val, _, _ = sess.run([ mapping_loss, adversary_loss, mapping_step, adversary_step ], feed_dict) if update % 3125 == 0: print('After {} Steps, DA LR is:{}'.format( update * 320, cur_adda_lr)) ############################################# return action_value_loss, cur_lr #, cur_adda_lr saver = tf.train.Saver(max_to_keep=100) part_vars_names = ('model/c1/b', 'model/c1/w', 'model/c2/b', 'model/c2/w', 'model/c3/b', 'model/c3/w', 'model/fc1/b', 'model/fc1/w') #part_vars_names = ('model/c1/b','model/c1/w','model/c2/b','model/c2/w','model/c3/b','model/c3/w') part_vars = [var for var in params if var.name[:-2] in part_vars_names] #print(part_vars) saver_adda = tf.train.Saver(part_vars) def save_model(save_step): #saver.save(sess, './hg_normal_with_da/MultiTexture/5steps_after_20e6/hg_normal_many_textures_with_da_model',global_step = save_step, write_meta_graph=False) #saver.save(sess, './hg_normal_with_da/MultiTexture/Seed 1/hg_normal_many_textures_with_da_model',global_step = save_step, write_meta_graph=False) saver.save( sess, '/misc/lmbraid18/raob/Snapshots_with_DA/Source/Small_High_Frequency_Updates/5steps_after_40e6/linearly_decrease_LR/Seed 2/hg_normal_5steps_40e6_decLR_model', global_step=save_step, write_meta_graph=False) #saver.save(sess, '/misc/lmbraid18/raob/Snapshots_no_DA/Multiple Snapshots/hg_normal_target_no_da/Seed 3/hg_normal_target_no_da_model', global_step = save_step, write_meta_graph=False) def load_model(snapshot, seed, adda_mode=False): # Load the saved parameters of the graph #if snapshot == 0: #saver.restore(sess, './hg_normal/hg_normal_model') #saver.restore(sess, '/misc/lmbraid18/raob/Snapshots_no_DA/Multiple Snapshots/hg_multiTexture_no_da/Seed 0/hg_multiTexture_no_da_model-66') saver.restore( sess, '/misc/lmbraid18/raob/Snapshots_no_DA/Multiple Snapshots/hg_normal_no_da/Seed 0/hg_normal_no_da_model-66' ) #saver.restore(sess, './hg_normal_many_textures/hg_normal_many_textures_model') #saver.restore(sess, '/misc/lmbraid18/raob/Snapshots_with_DA/Source/Small_High_Frequency_Updates/5steps_after_40e6/linearly_decrease_LR/Seed {}/hg_normal_5steps_40e6_decLR_model-{}'.format(seed, snapshot)) #saver.restore(sess, '/misc/lmbraid18/raob/Snapshots_with_DA/MultiTexture/Small High Frequency Updates/5steps_after_20e6/linearly decrease LR/Seed {}/hg_multiTexture_5steps_20e6_decLR_model-{}'.format(seed, snapshot)) #saver.restore(sess, '/misc/lmbraid18/raob/Snapshots_no_DA/Multiple Snapshots/hg_normal_no_da/Seed {}/hg_normal_no_da_model-{}'.format(seed, snapshot)) #saver.restore(sess, '/misc/lmbraid18/raob/Snapshots_no_DA/Multiple Snapshots/hg_multiTexture_no_da/Seed {}/hg_multiTexture_no_da_model-{}'.format(seed, snapshot)) #if snapshot > 0 and adda_mode: #saver_adda.restore(sess, './adda_doom_DA/hg_multiTexture_snapshots/2e-4/Seed {}/adda_doom_DA-{}'.format(seed, snapshot)) #saver_adda.restore(sess, './adda_doom_DA/hg_normal_snapshots/Seed {}/adda_doom_DA-{}'.format(seed, snapshot)) #saver_adda.restore(sess, './adda_doom_DA/hg_normal_many_textures_snapshots/Seed {}/adda_doom_DA-{}'.format(seed, snapshot)) #saver.restore(sess, './hg_normal_many_textures/hg_normal_many_textures_model') 
        #print(sess.run('model/c1/b:0'))
        copy_op = step_model.get_copy_weights_operator()

        def update_target():
            sess.run(copy_op)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.save_model = save_model
        self.load_model = load_model
        self.update_target = update_target
        tf.global_variables_initializer().run(session=sess)

        def print_var():
            # Debug helper: resolve the variables here so the function does not
            # depend on the commented-out module-level lookups above.
            var = [v for v in tf.global_variables() if v.op.name == "model/Qf/b"][0]
            var_tar = [v for v in tf.global_variables() if v.op.name == "target_model/Qf_target/b"][0]
            print(sess.run(var))
            print(sess.run(var_tar))

        self.print_var = print_var
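# The ADDA block above pairs two sparse softmax cross-entropy losses over the same
# discriminator logits: the true domain labels train the discriminator
# (adversary_loss), and the flipped labels (1 - label) train the target encoder to
# fool it (mapping_loss).  A minimal numpy sketch of that flipped-label pairing;
# `logits` and `domain_labels` are illustrative names, not taken from the model code.
import numpy as np

def softmax_xent(logits, labels):
    # Sparse softmax cross-entropy, averaged over the batch.
    shifted = logits - logits.max(axis=1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_probs[np.arange(len(labels)), labels].mean()

logits = np.array([[2.0, -1.0], [0.5, 0.3], [-1.5, 2.2]])   # discriminator outputs
domain_labels = np.array([0, 0, 1])                          # 0 = source, 1 = target

adversary_loss = softmax_xent(logits, domain_labels)         # update discriminator vars
mapping_loss = softmax_xent(logits, 1 - domain_labels)       # update target-encoder vars
print(adversary_loss, mapping_loss)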
def __init__(self, policy, env, nsteps, icm,idf, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs*nsteps self.idf=idf print("This is Icm in Model Init function " , type(icm)) with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) if icm is not None : grads = grads + icm.pred_grads_and_vars # print("Gradients added ") # print("independetly there shape were a2c : {} icm :{} and together {} ".format(np.shape(grads),np.shape(icm.pred_grads_and_vars), # np.shape(grads_and_vars))) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values , next_obs ) : #, icm_rewards,cumulative_dicounted_icm): #, new_rew): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') # print(" icm called in train function ", type(icm)) advs = rewards - values # print("Now the advantage ", advs ) # icm_adv = icm_rewards - values # m , s = get_mean_and_std(icm_adv) # > adv Normaliztion # m , s = get_mean_and_std(advs) # advs = (advs - m) / (s + 1e-7) # advs = (icm_adv - m) / (s + 1e-7) # icm_adv = (icm_adv - icm_adv.mean()) / ( + 1e-7) # print("icm advantage ", icm_adv) # advs = new_rew - values # print("Advantage :", advs) # print("On train shapes are ") # print(" obs {} states {} rewards {} masks {} actions {} values {} ". 
# format(np.shape(obs) , np.shape(states) , np.shape(rewards) , np.shape(masks) ,np.shape(actions) , # np.shape(values) )) # print("Received Advantage {} rewards {} values {}".format( # advs , rewards , values) ) # print("advantage reward and values shape ") # print("advs {} , rewards shape {} , values {}".format(np.shape(advs) , np.shape(rewards) , np.shape(values))) for step in range(len(obs)): cur_lr = lr.value() if icm is None : td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} else : # print("curiosity Td Map ") # print(" obs {} , next obs {} , actions {} ".format(np.shape(obs) , np.shape(next_obs), # np.shape(actions))) td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr , icm.state_:obs, icm.next_state_ : next_obs , icm.action_ : actions }# , icm.R :rewards } if icm is None : if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy else : if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks if self.idf : policy_loss, value_loss, policy_entropy,forward_loss , inverse_loss , icm_loss, _ = sess.run( [pg_loss, vf_loss, entropy, icm.forw_loss , icm.inv_loss, icm.icm_loss ,_train], td_map) return policy_loss, value_loss, policy_entropy,forward_loss , inverse_loss , icm_loss, advs else : policy_loss, value_loss, policy_entropy,forward_loss , icm_loss, _ = sess.run( [pg_loss, vf_loss, entropy, icm.forw_loss , icm.icm_loss ,_train], td_map) return policy_loss, value_loss, policy_entropy,forward_loss , 0.0 , icm_loss, advs self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
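# The ICM hookup above runs `icm.forw_loss` / `icm.inv_loss` alongside the A2C
# update.  The intrinsic reward behind the forward loss is typically the prediction
# error of the next-state feature embedding; a minimal numpy sketch (eta and the
# feature arrays are illustrative, not from the icm module wired in above):
import numpy as np

def curiosity_reward(phi_next, phi_next_pred, eta=0.01):
    # r_int = (eta / 2) * || phi(s') - f(phi(s), a) ||^2, per transition
    return 0.5 * eta * np.sum((phi_next - phi_next_pred) ** 2, axis=-1)

phi_next = np.random.randn(5, 288)        # encoder features of the true next state
phi_next_pred = np.random.randn(5, 288)   # forward-model prediction from (phi(s), a)
print(curiosity_reward(phi_next, phi_next_pred))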
def __init__(self, model_template, num_options, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', option_eps=0.001, delib_cost=0.001): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) self.sess = sess self.rng = np.random.RandomState(0) # TODO nact = ac_space.n nbatch = nenvs * nsteps nopt = num_options self.option_eps = option_eps self.action_eps = epsilon batch_indexer = tf.range(nbatch) print("Building rest of the graph.") self.actions = tf.placeholder(shape=[nbatch], dtype=tf.int32) self.options = tf.placeholder(shape=[nbatch], dtype=tf.int32) self.rewards = tf.placeholder(shape=[nbatch], dtype=tf.float32) self.deliberation_costs = tf.placeholder(shape=[nbatch], dtype=tf.float32) self.lr = tf.placeholder(shape=[], dtype=tf.float32) summary = [] # Networks self.step_model = Network(model_template, nopt, ob_space, ac_space, nenvs, 1, nstack, reuse=False) self.train_model = Network(model_template, nopt, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) # Indexers self.responsible_options = tf.stack([batch_indexer, self.options], axis=1) self.responsible_actions = tf.stack([batch_indexer, self.actions], axis=1) self.network_indexer = tf.stack([self.options, batch_indexer], axis=1) # Q Values OVER options self.disconnected_q_vals = tf.stop_gradient( self.train_model.q_values_options) # Q values of each option that was taken self.responsible_opt_q_vals = tf.gather_nd( params=self.train_model.q_values_options, indices=self.responsible_options ) # Extract q values for each option self.disconnected_q_vals_option = tf.gather_nd( params=self.disconnected_q_vals, indices=self.responsible_options) # Termination probability of each option that was taken self.terminations = tf.gather_nd( params=self.train_model.termination_fn, indices=self.responsible_options) # Q values for each action that was taken relevant_networks = tf.gather_nd( params=self.train_model.intra_option_policies, indices=self.network_indexer) relevant_networks = tf.nn.softmax(relevant_networks, dim=1) self.action_values = tf.gather_nd(params=relevant_networks, indices=self.responsible_actions) # Weighted average value self.value = tf.reduce_max( self.train_model.q_values_options) * (1 - option_eps) + ( option_eps * tf.reduce_mean(self.train_model.q_values_options)) disconnected_value = tf.stop_gradient(self.value) # Losses; TODO: Why reduce sum vs reduce mean? 
self.value_loss = vf_coef * tf.reduce_mean( vf_coef * 0.5 * tf.square(self.rewards - self.responsible_opt_q_vals)) self.policy_loss = tf.reduce_mean( tf.log(self.action_values) * (self.rewards - self.disconnected_q_vals_option)) self.termination_loss = tf.reduce_mean( self.terminations * ((self.disconnected_q_vals_option - disconnected_value) + self.deliberation_costs)) action_probabilities = self.train_model.intra_option_policies self.entropy = ent_coef * tf.reduce_mean( action_probabilities * tf.log(action_probabilities)) self.loss = -self.policy_loss - self.entropy - self.value_loss - self.termination_loss # Gradients train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'model') gradients = tf.gradients(self.loss, train_vars) grads, grad_norms = tf.clip_by_global_norm(gradients, max_grad_norm) grads = list(zip(grads, train_vars)) trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) self.apply_grads = trainer.apply_gradients(grads) # Summary avg_reward = tf.reduce_mean(self.rewards) summary.append(tf.summary.scalar('policy_loss', self.policy_loss)) summary.append(tf.summary.scalar('value_loss', self.value_loss)) summary.append( tf.summary.scalar('termination_loss', self.termination_loss)) summary.append(tf.summary.scalar('entropy', self.entropy)) summary.append(tf.summary.scalar('avg_reward', avg_reward)) self.summary_op = tf.summary.merge(summary) self.print_op = [ self.policy_loss, self.value_loss, self.termination_loss, avg_reward ] lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, options, actions, rewards, costs): feed_dict = { self.train_model.observations: obs, self.actions: actions, self.options: options, self.rewards: rewards, self.deliberation_costs: costs } train_ops = [self.apply_grads, self.summary_op, self.print_op] _, summary, summary_str = sess.run(train_ops, feed_dict=feed_dict) print(summary_str) return summary def setup_tensorflow(sess, writer): self.step_model.setup_tensorflow(sess, writer) self.train_model.setup_tensorflow(sess, writer) self.train = train self.setup_tensorflow = setup_tensorflow self.initial_state = self.step_model.initial_state self.step = self.step_model.step self.value = self.step_model.value self.update_options = self.step_model.update_options tf.global_variables_initializer().run(session=sess)
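# The option-critic model above values a state under an eps-greedy policy over
# options: V(s) = (1 - eps) * max_o Q(s, o) + eps * mean_o Q(s, o).  A per-state
# numpy sketch of that weighted value (the q array is illustrative):
import numpy as np

def value_over_options(q_values_options, option_eps):
    # q_values_options: [batch, num_options]
    greedy = q_values_options.max(axis=-1)
    uniform = q_values_options.mean(axis=-1)
    return (1.0 - option_eps) * greedy + option_eps * uniform

q = np.array([[1.0, 0.2, -0.5],
              [0.1, 0.4, 0.3]])
print(value_over_options(q, option_eps=0.001))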
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, diverse_r_coef=0.1, gamma=0.99, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs * nsteps with tf.variable_scope('vfo_model', reuse=tf.AUTO_REUSE): step_model = policy(nbatch=nenvs, nsteps=1, sess=sess) train_model = policy(nbatch=nbatch, nsteps=nsteps, sess=sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) params = find_trainable_variables('vfo_model') print(params) # ============================== # model-free actor-critic loss # ============================== with tf.variable_scope('mf_loss'): neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) # ============================== # diverse options policy loss # ============================== option_train_ops = [] option_losses = [] option_losses_names = [] option_distil_train_op = None with tf.variable_scope('options_loss'): diversity_reward = -1 * tf.nn.softmax_cross_entropy_with_logits_v2( labels=train_model.op_z, logits=train_model.option_discriminator) diversity_reward = tf.check_numerics( diversity_reward, 'Check numerics (1): diversity_reward') diversity_reward -= tf.log( tf.reduce_sum(train_model.prior_op_z * train_model.op_z) + 1e-6) print('d_reward:', diversity_reward.get_shape().as_list()) intrinsic_reward = tf.multiply( train_model.next_pvfs - train_model.pvfs, train_model.op_z) intrinsic_reward = tf.reduce_sum(intrinsic_reward, 1) print('i_reward:', intrinsic_reward.get_shape().as_list()) reward = diverse_r_coef * diversity_reward + intrinsic_reward with tf.variable_scope('critic'): next_vf = tf.reduce_sum( tf.multiply(train_model.next_pvfs, train_model.op_z), 1) print('next_vf:', next_vf.get_shape().as_list()) option_q_y = tf.stop_gradient(reward + (1 - train_model.dones) * gamma * next_vf) option_q = tf.squeeze(train_model.option_q, 1) print('option_q_y:', option_q_y.get_shape().as_list()) print('option_q:', option_q.get_shape().as_list()) option_q_loss = 0.5 * tf.reduce_mean( (option_q_y - option_q)**2) with tf.variable_scope('actor'): log_op_pi_t = train_model.option_pd.logp(A) log_target_t = tf.squeeze(train_model.option_q, 1) pvf = tf.reduce_sum( tf.multiply(train_model.pvfs, train_model.op_z), 1) print('op_pi:', log_op_pi_t.get_shape().as_list()) print('op_t:', log_target_t.get_shape().as_list()) print('pvf:', pvf.get_shape().as_list()) kl_surrogate_loss = tf.reduce_mean( log_op_pi_t * tf.stop_gradient(log_op_pi_t - log_target_t - pvf)) with tf.variable_scope('discriminator'): print('op_z:', train_model.op_z.get_shape().as_list()) print('op_dis:', train_model.option_discriminator.get_shape().as_list()) discriminator_loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits_v2( labels=train_model.op_z, logits=train_model.option_discriminator_logits)) with tf.variable_scope('distillation'): # NOTE: to train 
distillation, op_z should be feed with q(z|s) print('mf_pi:', train_model.pi.get_shape().as_list()) print('op_pi:', train_model.option_pi.get_shape().as_list()) distillation_loss = losses.mean_squared_error( tf.stop_gradient(train_model.pi), train_model.option_pi) _train_option_q = tf.train.AdamOptimizer(lr).minimize( loss=option_q_loss, var_list=params) option_train_ops.append(_train_option_q) option_losses.append(option_q_loss) option_losses_names.append('option_critic') _train_option_policy = tf.train.AdamOptimizer(lr).minimize( loss=kl_surrogate_loss, var_list=params) option_train_ops.append(_train_option_policy) option_losses.append(kl_surrogate_loss) option_losses_names.append('option_actor') _train_option_disc = tf.train.AdamOptimizer(lr).minimize( loss=discriminator_loss, var_list=params) option_train_ops.append(_train_option_disc) option_losses.append(discriminator_loss) option_losses_names.append('option_discriminator') option_distil_train_op = tf.train.AdamOptimizer(lr).minimize( loss=distillation_loss, var_list=params) tf.summary.FileWriter(logger.get_dir(), sess.graph) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def train_options(obs, next_obs, states, next_states, masks, next_masks, actions, actions_full, dones, options_z): feed = { train_model.X: obs, train_model.X_next: next_obs, A: actions, train_model.ac: actions_full, train_model.dones: dones, train_model.op_z: options_z } if states is not None: feed[train_model.S] = states feed[train_model.next_S] = next_states feed[train_model.M] = masks feed[train_model.next_M] = next_masks record_loss_values = [] for name, loss, train_op in zip(option_losses_names, option_losses, option_train_ops): loss_value, _ = sess.run([loss, train_op], feed) record_loss_values.append((name + '_loss', loss_value)) return record_loss_values def distill_mf_to_options(obs, states, masks): feed = {train_model.X: obs} if states is not None: feed[train_model.S] = states feed[train_model.M] = masks option_ensembles = sess.run(train_model.option_discriminator, feed) feed[train_model.op_z] = option_ensembles distillation_loss_value, _ = sess.run( [distillation_loss, option_distil_train_op], feed) return distillation_loss_value self.train = train self.train_options = train_options self.distill_mf_to_options = distill_mf_to_options self.train_model = train_model self.prior_op_z = train_model.prior_op_z self.step_model = step_model self.step = step_model.step self.option_step = step_model.option_step self.option_select = step_model.option_select self.selective_option_step = step_model.selective_option_step self.value = step_model.value self.proto_value = step_model.proto_value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
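# The options loss above builds a DIAYN-style diversity reward from the option
# discriminator: r_div = log q(z | s) - log p(z), implemented as a negative
# cross-entropy between the sampled one-hot z and the discriminator logits, minus
# the log prior.  Minimal numpy sketch with illustrative arrays:
import numpy as np

def diversity_reward(disc_logits, z_onehot, prior):
    shifted = disc_logits - disc_logits.max(axis=1, keepdims=True)
    log_q = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    log_q_z = np.sum(log_q * z_onehot, axis=1)                  # log q(z|s)
    log_p_z = np.log(np.sum(prior * z_onehot, axis=1) + 1e-6)   # log p(z)
    return log_q_z - log_p_z

disc_logits = np.array([[2.0, 0.1, -1.0], [0.3, 0.2, 0.1]])
z_onehot = np.array([[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]])
prior = np.full(3, 1.0 / 3.0)
print(diversity_reward(disc_logits, z_onehot, prior))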
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(20e6), lrschedule='linear'): ''' sess = tf.get_default_session() nbatch = nenvs*nsteps step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) A = train_model.pdtype.sample_placeholder([nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) ''' # begin diff sess = tf.get_default_session() step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, reuse=True) L = tf.placeholder(tf.int32, [1]) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) # end diff neglogpac = train_model.pd.neglogp(A) # length max_episode_steps pg_loss = tf.reduce_mean(tf.slice(ADV * neglogpac, [0], L)) vf_loss = tf.reduce_mean(tf.slice(mse(tf.squeeze(train_model.vf), R), [0], L)) entropy = tf.reduce_mean(tf.slice(train_model.pd.entropy(), [0], L)) loss = pg_loss-entropy*ent_coef+vf_loss*vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values, length): advs = rewards-values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr, L:np.asarray([length])} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run([pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
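# The variant above averages each loss only over the first `length` transitions of
# the batch (tf.slice(..., [0], L)), so padded steps beyond the episode end do not
# contribute gradient.  Equivalent numpy sketch with illustrative values:
import numpy as np

def masked_mean(per_step_loss, length):
    # Mean over the first `length` entries only.
    return per_step_loss[:length].mean()

per_step_loss = np.array([0.5, 0.2, 0.9, 0.0, 0.0])  # last two steps are padding
print(masked_mean(per_step_loss, length=3))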
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear', is_async=True): self.sess = sess = get_session() nbatch = nenvs * nsteps with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE): self.model = step_model = policy(nenvs, 1, sess=sess) self.model2 = train_model = policy(nenvs * nsteps, nsteps, sess=sess) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) self.logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV * neglogpac) entropy = tf.reduce_mean(train_model.pd.entropy()) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac) sample_net = train_model.vf + tf.random_normal(tf.shape( train_model.vf)) self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean( tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params = params = find_trainable_variables("acktr_model") self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ momentum=0.9, kfac_update=1, epsilon=0.01,\ stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm) # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, PG_LR: cur_lr, VF_LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map) return policy_loss, value_loss, policy_entropy self.train = train self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
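# The ACKTR Fisher construction above samples a fake regression target from the
# value head (sample_net = vf + N(0, 1)) and uses the squared error to that
# stop-gradient sample as a unit-variance Gaussian log-likelihood, so K-FAC can
# estimate the Fisher from its gradients.  Tiny numpy sketch of the resulting
# per-sample gradient (illustrative values):
import numpy as np

rng = np.random.RandomState(0)
vf = np.array([0.3, -1.2, 0.8])            # value-head outputs
sample = vf + rng.randn(3)                 # sampled targets, treated as constants
loglik = -0.5 * (sample - vf) ** 2         # unit-variance Gaussian log-likelihood
grad_wrt_vf = sample - vf                  # d loglik / d vf, i.e. just the sampled noise
print(loglik, grad_wrt_vf)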
class Model(object): def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta, scope, load_path, goal_shape): self.sess = sess self.nenv = nenvs self.nsteps = nsteps self.goal_shape = goal_shape nact = ac_space.n nbatch = nenvs * nsteps eps = 1e-6 self.scope = scope with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): self.A = tf.placeholder(tf.int32, [nbatch], name="action") # actions self.D = tf.placeholder(tf.float32, [nbatch], name="dones") # dones self.R = tf.placeholder(tf.float32, [nbatch], name="rewards") # rewards, not returns self.MU = tf.placeholder(tf.float32, [nbatch, nact], name="mus") # mu's self.LR = tf.placeholder(tf.float32, [], name="lr") self.AUX = tf.placeholder(tf.float32, [nbatch], name="aux") self.V_NEXT = tf.placeholder( tf.float32, [nbatch], name="value_next" ) # (by lzn: we revise goal-conditioned next value) step_ob_placeholder = tf.placeholder(ob_space.dtype, (nenvs, ) + ob_space.shape, "step_ob") step_goal_placeholder = tf.placeholder(tf.float32, (nenvs, ) + goal_shape, "step_goal") step_goal_encoded = step_goal_placeholder train_ob_placeholder = tf.placeholder( ob_space.dtype, (nenvs * nsteps, ) + ob_space.shape, "train_ob") train_goal_placeholder = tf.placeholder( tf.float32, (nenvs * nsteps, ) + goal_shape, "train_goal") train_goal_encoded = train_goal_placeholder concat_on_latent = False self.step_model = policy(nbatch=nenvs, nsteps=1, observ_placeholder=step_ob_placeholder, sess=self.sess, goal_placeholder=step_goal_placeholder, concat_on_latent=concat_on_latent, goal_encoded=step_goal_encoded) self.train_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=self.sess, goal_placeholder=train_goal_placeholder, concat_on_latent=concat_on_latent, goal_encoded=train_goal_encoded) variables = find_trainable_variables self.params = params = variables(scope) logger.info( "========================== {} =============================". format(scope)) for var in params: logger.info(var) logger.info( "========================== {} =============================\n". 
format(scope)) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) # print("========================== Ema =============================") def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) # print(v.name) return v # print("========================== Ema =============================") with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True): self.polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, goal_placeholder=train_goal_placeholder, sess=self.sess, concat_on_latent=concat_on_latent, goal_encoded=train_goal_encoded) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i # action probability distributions according to self.train_model, self.polyak_model and self.step_model # poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax train_model_p = tf.nn.softmax(self.train_model.pi) polyak_model_p = tf.nn.softmax(self.polyak_model.pi) self.step_model_p = tf.nn.softmax(self.step_model.pi) # (todo by lizn, use this to calculate next value) v = self.v = tf.reduce_sum(train_model_p * self.train_model.q, axis=-1) # shape is [nenvs * (nsteps)] # strip off last step # (todo by lizn, we don't need strip) f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, self.train_model.q]) # f, f_pol, q = map(lambda x: x, [train_model_p, polyak_model_p, self.train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, self.A) q_i = get_by_index(q, self.A) # Compute ratios for importance truncation rho = f / (self.MU + eps) rho_i = get_by_index(rho, self.A) # Calculate Q_retrace targets qret = q_retrace( self.R, self.D, q_i, self.V_NEXT, rho_i, nenvs, nsteps, gamma) # (todo by lizn, use new next state value) = q_retrace() # Calculate losses # Entropy # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(self.AUX * cat_entropy_softmax(f)) # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) # (todo by lzn: we do not need the strip the last one) check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling adv = qret - v logf = tf.log(f_i + eps) gain_f = logf * tf.stop_gradient( adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]) ) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2) gain_bc = tf.reduce_sum( logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1) # IMP: This is sum, as expectation wrt f loss_bc = -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]] * 2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) # Goal loss loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f) # [nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || 
f), f) k = -f_pol / ( f + eps ) # [nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) # [nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g / ( nenvs * nsteps ) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) # print("=========================== gards add ==============================") grads = [ gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params) ] # print("=========================== gards add ==============================\n") avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR, decay=rprop_alpha, epsilon=rprop_epsilon) # trainer = tf.train.AdamOptimizer(learning_rate=self.LR) _policy_opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_policy_opt_op]): _train_policy = tf.group(ema_apply_op) self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging self.run_ops_policy = [ _train_policy, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads ] self.names_ops_policy = [ 'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if trust_region: self.run_ops_policy = self.run_ops_policy + [ norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] self.names_ops_policy = self.names_ops_policy + [ 'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] self.names_ops_policy = [ scope + "_" + x for x in self.names_ops_policy ] # scope as prefix self.save = functools.partial(save_variables, sess=self.sess, variables=params) self.load = functools.partial(load_variables, sess=self.sess, variables=params) self.initial_state = self.step_model.initial_state if load_path is not None: tf.global_variables_initializer().run(session=self.sess) logger.info("loading pretrained model from {}".format(load_path)) self.load(load_path) else: tf.global_variables_initializer().run(session=self.sess) def train_policy(self, obs, next_obs, actions, rewards, dones, mus, states, masks, steps, goal_obs, aux, verbose=False): cur_lr = self.lr.value_steps(steps) # 1. calculate v_{t+1} using obs_{t+1} and g_t td_map = {self.train_model.X: next_obs} assert hasattr(self.train_model, "goals") td_map[self.train_model.goals] = goal_obs v_next = self.sess.run(self.v, feed_dict=td_map) # 2. 
use obs_t, goal_t, v_{t+1} to train policy td_map = { self.train_model.X: obs, self.polyak_model.X: obs, self.A: actions, self.R: rewards, self.D: dones, self.MU: mus, self.LR: cur_lr, self.V_NEXT: v_next, self.AUX: aux } ########################################## debug = False if debug: self._test(obs, next_obs, actions, rewards, dones, mus, goal_obs) ############################################ assert hasattr(self.train_model, "goals") assert hasattr(self.polyak_model, "goals") if hasattr(self, "goal_rms"): self.goal_rms.update(goal_obs) td_map[self.train_model.goals] = goal_obs td_map[self.polyak_model.goals] = goal_obs if states is not None: td_map[self.train_model.S] = states td_map[self.train_model.M] = masks td_map[self.polyak_model.S] = states td_map[self.polyak_model.M] = masks if verbose: names_ops_policy = self.names_ops_policy.copy() values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:] # strip off _train else: names_ops_policy = self.names_ops_policy.copy( )[:8] # not including trust region values_ops_policy = self.sess.run(self.run_ops_policy, td_map)[1:][:8] return names_ops_policy, values_ops_policy def step(self, observation, **kwargs): return self.step_model.evaluate( [self.step_model.action, self.step_model_p, self.step_model.state], observation, **kwargs) def _test(self, obs, next_obs, actions, rewards, dones, mus, goal_obs): _obs, _next_obs, _actions, _dones, _goals, _mus, _rewards = self.generate_fake( obs, next_obs, actions, dones, goal_obs, mus, rewards) td_map = dict() td_map[self.train_model.goals] = _goals td_map[self.train_model.X] = _next_obs v_next = self.sess.run(self.v, feed_dict=td_map) print("v_next", v_next) td_map[self.train_model.X] = _obs td_map[self.A] = _actions td_map[self.R] = _rewards td_map[self.MU] = _mus td_map[self.D] = _dones td_map[self.V_NEXT] = v_next print("------td map--------") print(td_map) print("------td map--------") print("-------q_iter--------") q_iter = self.sess.run(self.q_iter, feed_dict=td_map) print(q_iter) print("-------q_iter--------") print("-------q_iter_after--------") q_iter_after = self.sess.run(self.q_iter, feed_dict=td_map) print(q_iter_after) print("-------q_iter_after--------") print("--------rs---------") rs = self.sess.run(self.rs, feed_dict=td_map) print(rs) print("--------rs---------") q_i, rho_i, qret = self.sess.run([self.q_i, self.rho_i, self.qret], feed_dict=td_map) print("q_i", q_i) print("rho_i", rho_i) print("q_ret", qret) assert 0 def generate_fake(self, obs, next_obs, actions, dones, goals, mus, rewards): obs_new = np.random.randn(self.nenv, self.nsteps + 1, *obs.shape[1:]) _obs = obs_new[:, :-1].reshape((-1, ) + obs.shape[1:]) _next_obs = obs_new[:, 1:].reshape((-1, ) + next_obs.shape[1:]) _actions = np.ones_like(actions) _dones = dones _goals = np.zeros_like(goals) _mus = np.random.randn(*mus.shape) _mus = _mus / np.sum(_mus, axis=-1, keepdims=True) print(self.sess.run(self.params)) print("obs", obs) print("_obs", obs_new) _rewards = np.ones_like(rewards) return _obs, _next_obs, _actions, _dones, _goals, _mus, _rewards
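# train_policy() above first evaluates per-step next-state values (V_NEXT) and then
# feeds them into q_retrace to build the off-policy Retrace targets.  A numpy sketch
# of the backward recursion with truncated importance weights for a single
# environment (illustrative values; the q_retrace used in this codebase is the
# goal-conditioned variant that takes these precomputed next values):
import numpy as np

def retrace_targets(rewards, dones, q_taken, values_next, rho_taken, gamma=0.99):
    nsteps = len(rewards)
    rho_bar = np.minimum(1.0, rho_taken)      # truncated importance weights
    qret = values_next[-1]                    # bootstrap from the final next-state value
    targets = np.zeros(nsteps)
    for t in reversed(range(nsteps)):
        qret = rewards[t] + gamma * (1.0 - dones[t]) * qret
        targets[t] = qret
        if t > 0:
            # off-policy correction toward V(s_t); values_next[t - 1] == V(s_t)
            qret = rho_bar[t] * (qret - q_taken[t]) + values_next[t - 1]
    return targets

rewards = np.array([0.0, 1.0, 0.0])
dones = np.array([0.0, 0.0, 1.0])
q_taken = np.array([0.4, 0.9, 0.2])
values_next = np.array([0.5, 0.3, 0.0])   # V(s_{t+1}) for each step
rho_taken = np.array([1.3, 0.7, 2.0])     # pi(a|s) / mu(a|s) for the taken actions
print(retrace_targets(rewards, dones, q_taken, values_next, rho_taken))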
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) self.logits = logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV*logpac) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params=params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss,params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ momentum=0.9, kfac_update=1, epsilon=0.01,\ stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads,params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', hparams=None): assert hparams != None hparams['_vf_coef'] = vf_coef # Create the session. sess = tf_util.make_session( per_process_gpu_memory_fraction=hparams.get('gpu_fraction', 0.25)) self.sess = sess # Copy hparams. self.hparams = hparams self.nenvs = nenvs self.nsteps = nsteps self.hparams['batch_size'] = nenvs * nsteps # Setup constants. nact = ac_space.n nbatch = nenvs * nsteps self.nbatch = nbatch nh, nw, nc = ob_space.shape ob_shape_train = (nbatch, nh, nw, nc) ob_shape_step = (nenvs, nh, nw, nc) # Setup placeholders. A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) TEACHER_C = tf.placeholder(tf.float32, []) DROPOUT_STRENGTH = tf.placeholder(tf.float32, [], name='DROPOUT_STRENGTH') self.DROPOUT_STRENGTH = DROPOUT_STRENGTH X_train = tf.placeholder(tf.float32, ob_shape_train, name='Ob_train') #obs X_step = tf.placeholder(tf.float32, ob_shape_step, name='Ob_step') #obs attention_truth = None step_hparams = copy.deepcopy(hparams) train_hparams = copy.deepcopy(hparams) # if self.hparams.get('fixed_dropout_noise'): # self.step_env_random = tf.get_variable( # shape=[nenvs, 7, 7, 1], # name='env_random', # initializer=tf.truncated_normal_initializer(), # trainable=False, # ) # self.train_env_random = tf.tile(tf.expand_dims(self.step_env_random, axis=0), multiples=[nsteps, 1, 1, 1, 1]) # self.train_env_random = tf.reshape( # tf.transpose(self.train_env_random, perm=[1, 0, 2, 3, 4]), # [nbatch, 7, 7, 1]) # step_hparams['_env_random'] = self.step_env_random # train_hparams['_env_random'] = self.train_env_random # train_hparams['_dropout_strength'] = DROPOUT_STRENGTH # step_hparams['_dropout_strength'] = DROPOUT_STRENGTH # Create the models. step_model = policy(sess, X_step, ob_space, ac_space, nenvs, 1, reuse=False, hparams=step_hparams) train_model = policy(sess, X_train, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True, hparams=train_hparams) if hparams.get('teacher_ckpt'): assert hparams.get('use_fixed_attention') or hparams.get( 'learn_attention_from_teacher') or hparams.get( 'do_joint_training') # Create the teacher, so that way we can use its attention weights # instead of learning how to do attention on our own. 
# step_teacher = self._create_sfmnet(X_step, reuse=False, is_step_model=True) train_teacher = self._create_object_segmentation_net( X_train, reuse=False, is_step_model=False, embedding=train_model.original_h if hparams['do_joint_training'] else None, ) train_attention_truth, train_attention_mask = self._get_attention_truth( train_teacher, is_step_model=False) # step_attention_truth = self._get_attention_truth(step_teacher, is_step_model=True) # if hparams.get('use_fixed_attention'): # step_hparams['_attention_truth'] = step_attention_truth # train_hparams['_attention_truth'] = train_attention_truth # if hparams.get('do_joint_training'): # step_hparams['_teacher_h3'] = step_teacher.conv3 # step_hparams['_teacher_h'] = step_teacher.embedding # train_hparams['_teacher_h3'] = train_teacher.conv3 # train_hparams['_teacher_h'] = train_teacher.embedding # if hparams.get('use_target_model'): # assert not hparams.get('do_joint_training') # target_hparams = copy.copy(train_hparams) # target_hparams['_policy_scope'] = 'target_model' # target_hparams['_src_scope'] = 'model' # target_model = policy(sess, X_step, ob_space, ac_space, nenvs, 1, reuse=False, hparams=target_hparams) # target_model.setup_copy_weights() # self.target_model = target_model scaled_images = tf.cast(train_model.X, tf.float32) / 255. print('scaled_images shape: {}'.format(scaled_images)) sfm_base = object_segmentation.ObjectSegmentationBase( frames=scaled_images, embedding=train_model.h) sfm_hparams = copy.deepcopy(hparams) sfm_hparams['batch_size'] = nenvs * nsteps tf.summary.image('frame0', tf.expand_dims(train_model.X[..., -2], axis=-1), max_outputs=1) tf.summary.image('frame1', tf.expand_dims(train_model.X[..., -1], axis=-1), max_outputs=1) # Create the loss function. def a2c_loss(pi, vf): neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) # ent_coef_mode = hparams.get('ent_coef_mode', 'default') # ent_coef_val = hparams.get('ent_coef_val', ent_coef) # if ent_coef_mode == 'default': # actual_ent_coef = ent_coef_val # elif ent_coef_mode == 'linear_teacher': # actual_ent_coef = ent_coef_val * TEACHER_C + ent_coef * (1 - TEACHER_C) # elif ent_coef_mode == 'additive_teacher': # actual_ent_coef = ent_coef_val + ent_coef_val * TEACHER_C # else: # raise Exception('unrecognized ent_coef_mode: {}'.format(ent_coef_mode)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef return loss, pg_loss, vf_loss, entropy loss, pg_loss, vf_loss, entropy = a2c_loss(train_model.pi, train_model.vf) # if hparams.get('dropout_data_aug_c'): # logged_augs = False # loss_c = 1.0 - hparams['num_dropout_models'] * hparams['dropout_data_aug_c'] # assert loss_c >= hparams['dropout_data_aug_c'] - 1e-5 # loss = loss_c * loss # for pi_noise, vf_noise in zip(train_model.pi_noises, train_model.vf_noises): # l2, pg2, vf2, entropy2 = a2c_loss(pi_noise, vf_noise) # loss += l2 * hparams['dropout_data_aug_c'] # if not logged_augs: # logged_augs = True # tf.summary.scalar('aug_loss', tf.reduce_mean(l2)) # tf.summary.scalar('aug_pgloss', tf.reduce_mean(pg2)) # tf.summary.scalar('aug_vfloss', tf.reduce_mean(vf2)) # tf.summary.scalar('aug_entropyloss', tf.reduce_mean(entropy2)) # print("ADDING DROPOUT DATA AUG") # if hasattr(train_model, 'noise_loss') and hparams.get('noise_loss_c'): # loss += train_model.noise_loss # print("ADDING NOISE LOSS") # tf.summary.image('frame0', 
tf.expand_dims(train_model.X[..., -2],-1), max_outputs=1) # tf.summary.image('frame1', tf.expand_dims(train_model.X[..., -1],-1), max_outputs=1) teacher_loss = 0.0 if hparams.get('teacher_ckpt') and hparams.get( 'learn_attention_from_teacher'): assert hparams.get('attention_20') or hparams.get( 'inverted_attention_20') # Load in the teacher. # teacher = sfmnet.SfmNet(hparams=sfm_hparams, sfm_base=sfm_base, is_teacher_network=True) # attention_loss = tf.nn.softmax_cross_entropy_with_logits( # labels=train_attention_truth, # logits=tf.reshape(train_model.attention_logits, [nbatch,-1]) # ) # print('attention_loss: {}'.format(attention_loss.get_shape())) # print('train_attention_mask: {}'.format(train_attention_mask.get_shape())) # attention_loss = attention_loss * train_attention_mask # attention_loss = tf.reduce_mean(attention_loss) # # for t in [5., 10., 20., 40., 75., 100., 200., 500., 1000.]: # # truth = tf.nn.softmax(coarse_masks / t) # # tf.summary.image('attention_truth_{}'.format(t), tf.reshape(truth, [nbatch, 7, 7, 1]), max_outputs=1) # tf.summary.scalar('attention_loss', attention_loss) # tf.summary.scalar('attention_teaching', tf.reduce_mean(train_attention_mask)) # teacher_loss = TEACHER_C * attention_loss tf.summary.scalar('teacher_c', TEACHER_C) truth, mask = self._get_attention_truth_20(train_teacher, is_step_model=False) tf.summary.image('attention_20_truth', tf.reshape(truth, [80, 20, 20, 1]), max_outputs=1) if hparams.get('attention_20'): attention_loss_20 = tf.nn.softmax_cross_entropy_with_logits( labels=truth, logits=tf.reshape(train_model.attention_logits_20, [-1, 400])) attention_loss_20 = tf.reduce_mean(attention_loss_20 * mask) tf.summary.scalar('attention_loss_20', attention_loss_20) tf.summary.scalar('attention_teaching_20', tf.reduce_mean(mask)) teacher_loss += TEACHER_C * attention_loss_20 if hparams.get('extrapath_attention_20'): print("EXTRAPATH ATTENTION!!!") attention_loss_20 = tf.nn.softmax_cross_entropy_with_logits( labels=truth, logits=tf.reshape( train_model.extrapath_attention_logits_20, [-1, 400])) attention_loss_20 = tf.reduce_mean(attention_loss_20 * mask) tf.summary.scalar('attention_loss_20', attention_loss_20) tf.summary.scalar('attention_teaching_20', tf.reduce_mean(mask)) teacher_loss += (-TEACHER_C) * attention_loss_20 # if hparams.get('learn_attention_from_pg'): # attention_logits = tf.reshape(train_model.attention_logits, [nbatch, 49]) # attention_actions = sample(attention_logits) # attention_actions = tf.stop_gradient(attention_actions) # attention_neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=attention_logits, labels=attention_actions) # attention_pg_loss = tf.reduce_mean(ADV * attention_neglogpac) # tf.summary.scalar('attention_pg_loss', attention_pg_loss) # loss += attention_pg_loss * hparams['learn_attention_from_pg'] # if hparams.get('teacher_ckpt') and hparams.get('learn_translation_from_teacher'): # with tf.variable_scope("model"): # with tf.variable_scope('object_translation'): # pred_translation = fc(train_model.h, 'obj_t', nh=2*self.hparams['k_obj'], init_scale=1.0) # pred_translation = tf.reshape(pred_translation, (-1, self.hparams['k_obj'], 2)) # teacher_translation = tf.stop_gradient(train_teacher.object_translation) # translation_loss = mse(pred_translation, teacher_translation) # translation_loss = tf.reduce_mean(translation_loss) # teacher_loss += TEACHER_C * translation_loss # tf.summary.scalar('translation_loss', translation_loss) if hparams['do_joint_training']: teacher_loss += tf.reduce_mean( 
train_teacher.transform_loss + train_teacher.mask_reg_loss) * TEACHER_C if hasattr(train_model, 'attention_logits_20'): # Want a low entropy distribution, so that we are focused on only a small part of the image per frame. reshaped_logits = tf.reshape(train_model.attention_logits_20, [-1, 400]) attention_entropy = tf.reduce_mean(cat_entropy(reshaped_logits)) teacher_loss -= hparams[ 'attention_entropy_c'] * attention_entropy * TEACHER_C tf.summary.scalar('attention_entropy', attention_entropy) if hasattr(train_model, 'extrapath_attention_logits_20'): # Want a low entropy distribution, so that we are focused on only a small part of the image per frame. reshaped_logits = tf.reshape( train_model.extrapath_attention_logits_20, [-1, 400]) attention_entropy = tf.reduce_mean(cat_entropy(reshaped_logits)) teacher_loss -= hparams[ 'attention_entropy_c'] * attention_entropy * TEACHER_C tf.summary.scalar('extrapath_attention_entropy', attention_entropy) # if hasattr(train_model, 'attention_weights_20'): # # Want this to be high entropy, so we are looking at different parts of the image on different images. # batch_logits = tf.reshape(tf.reduce_sum(train_model.attention_weights_20, axis=0), [1, 400]) # attention_entropy = tf.reduce_mean(cat_entropy_softmax(batch_logits)) # loss -= hparams['batch_entropy_c'] * attention_entropy # tf.summary.scalar('batch_entropy', attention_entropy) # if hparams['do_joint_training'] and False: # assert hparams.get('teacher_ckpt') # teacher_loss += TEACHER_C * train_teacher.total_loss # else: # sfm_loss = None # if hparams['do_flow_prediction']: # assert hparams.get('teacher_ckpt') # flow_truth_x, flow_truth_y = self._get_flow_truth(train_teacher) # predicted_flow = conv(train_model.flow_base, 'pred_flow', nf=4, rf=1, stride=1, trainable=True) # flow_pred_x = tf.reshape(predicted_flow[..., :2], [-1, 2]) # flow_pred_y = tf.reshape(predicted_flow[..., 2:], [-1, 2]) # flow_x_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=flow_truth_x, logits=flow_pred_x)) # flow_y_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=flow_truth_y, logits=flow_pred_y)) # flow_loss = flow_x_loss + flow_y_loss # # flow_error = tf.reduce_mean(mse(flow_truth, predicted_flow)) # teacher_loss += TEACHER_C * flow_loss * hparams['flow_error_c'] # flow_x_acc = tf.reduce_mean(tf.cast(tf.argmax(flow_pred_x, axis=-1) == flow_truth_x, tf.int32)) # flow_y_acc = tf.reduce_mean(tf.cast(tf.argmax(flow_pred_y, axis=-1) == flow_truth_y, tf.int32)) # # tf.summary.scalar('flow_error_if_predict_zeros', tf.reduce_mean(0.5 * tf.square(flow_truth))) # tf.summary.scalar('flow_x_loss', flow_x_loss) # tf.summary.scalar('flow_y_loss', flow_y_loss) # tf.summary.scalar('flow_x_acc', flow_x_acc) # tf.summary.scalar('flow_y_acc', flow_y_acc) # # tf.summary.image('predicted_flow_x', tf.expand_dims(predicted_flow[..., 0], axis=-1), max_outputs=1) # # tf.summary.image('predicted_flow_y', tf.expand_dims(predicted_flow[..., 1], axis=-1), max_outputs=1) self.train_writer = tf.summary.FileWriter( os.path.join(hparams['base_dir'], 'logs', hparams['experiment_name']), sess.graph) # TODO(vikgoel): when we don't need the teacher, we should ensure that we don't merge its summaries so that way # we don't need to execute that part of the graph. 
merged_summaries = tf.summary.merge_all() trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) def get_train_op(loss_op): params = find_trainable_variables("model") # Switch from GATE_NONE to GATE_GRAPH to enhance reproducibility. #grads = tf.gradients(loss, params) grads_and_params = trainer.compute_gradients( loss=loss_op, var_list=params, gate_gradients=tf.train.RMSPropOptimizer.GATE_GRAPH) grads = [x[0] for x in grads_and_params] params = [x[1] for x in grads_and_params] if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) return trainer.apply_gradients(grads) _fast_train = get_train_op(loss) _teacher_train = get_train_op(loss + teacher_loss) params = find_trainable_variables("model") print('*' * 20) print('chosen trainable variables') for p in params: print(p.name) print('*' * 20) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) self.lr = lr write_counter = 0 def train(obs, states, rewards, masks, actions, values): nonlocal write_counter if lr.n % hparams['target_model_update_frequency'] == 0 and hasattr( self, 'target_model'): print('COPYING WEIGHTS INTO TARGET MODEL') self.target_model.copy_weights() advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() # Smooth approximation: #teacher_decay_c = hparams['teacher_decay_c']#9.9e-6 # 2.5e-5 #teacher_c = 1.0 / (teacher_decay_c * lr.n + 1) #teacher_c = min(hparams['max_teacher_c'], teacher_c) if not hparams['use_extra_path']: lerp = float(lr.n) / 1e7 lerp = min(lerp, 1) teacher_c = hparams['max_teacher_c'] * (1. - lerp) else: teacher_c = 1 # Linear decay schedule # teacher_c = (hparams['teacher_cutoff_step'] - lr.n) / hparams['teacher_cutoff_step'] # teacher_c = max(teacher_c, 0) # # Lower bound on the decay # teacher_c = (1 - hparams['teacher_loss_c']) * teacher_c + hparams['teacher_loss_c'] _train = _fast_train if teacher_c == 0 else _teacher_train td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr, TEACHER_C: teacher_c } # td_map[DROPOUT_STRENGTH] = get_dropout_strength(hparams, lr.n) if self.hparams['teacher_ckpt'] and self.hparams[ 'do_joint_training']: td_map[train_teacher.mask_reg_c] = 1 #if states is not None: # td_map[train_model.S] = states # td_map[train_model.M] = masks ops = [pg_loss, vf_loss, entropy, _train] # if hparams.get('no_train_a2c'): # ops = ops[:-1] if 'attention' in hparams['policy']: ops.append(train_model.attention_weights_20) write_summaries = hparams.get( 'teacher_ckpt') or 'attention' in hparams['policy'] if write_summaries: if write_counter % 10 != 0: write_summaries = False write_counter += 1 if write_summaries: ops.append(merged_summaries) sess_results = sess.run(ops, td_map) policy_loss = sess_results[0] value_loss = sess_results[1] policy_entropy = sess_results[2] if write_summaries: summary = sess_results[-1] self.train_writer.add_summary(summary, lr.n) if 'attention' in hparams['policy']: attention_output = sess_results[-2 if write_summaries else -1] publish_attention_weights(attention_output[:5, ...]) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = 
step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load # Initialize all of the variables in a deterministic order so that each experiment is reproducible. global_vars = tf.global_variables() global_vars = sorted(global_vars, key=lambda x: x.name) for var in global_vars: tf.variables_initializer([var]).run(session=sess) #tf.global_variables_initializer().run(session=sess) if hparams.get('teacher_ckpt'): # Load in the teacher AFTER doing the init so we don't overwrite the weights. restore_teacher_from_checkpoint(sess, hparams['teacher_ckpt'])
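# The train() above anneals the teacher-loss weight linearly from max_teacher_c
# down to 0 over the first 1e7 observed steps (unless the extra-path variant pins
# it at 1).  A standalone sketch of that schedule; the hparams values passed in
# below are illustrative:
def teacher_coefficient(step, max_teacher_c=1.0, decay_steps=1e7, use_extra_path=False):
    if use_extra_path:
        return 1.0
    lerp = min(float(step) / decay_steps, 1.0)
    return max_teacher_c * (1.0 - lerp)

for step in (0, 2_500_000, 5_000_000, 10_000_000, 20_000_000):
    print(step, teacher_coefficient(step, max_teacher_c=0.5))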
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', replay_lambda=1, ss_rate=1, replay_loss=None): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs*nsteps # If we have replay_loss, create replay buffer and stage buffer # Use this to enforce replay loss lower if replay_loss is not None: self.replay_buffer = [] # holds all past data A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) # Introduce replay_loss if given if replay_loss == "L2": # Replace train_model.pi with whatever is predicted label # Replace A with whatever is recorded label re_loss = tf.nn.l2_loss(tf.nn.softmax(train_model.pi) - A) / nbatch elif replay_loss == "Distillation": # Replace y_donor with whatever is recorded label # Replace y_acceptor with whatever is predicted label re_loss = tf.reduce_mean( - tf.reduce_sum(tf.stop_gradient(y_donor) * tf.log(y_acceptor), reduction_indices=1)) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef if replay_loss is not None: loss = loss + replay_lambda*re_loss params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
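# The replay-loss branch above is a template: its comments say to substitute the
# recorded label (y_donor) and the predicted label (y_acceptor).  A minimal numpy
# sketch of the intended distillation term, cross-entropy from a stored policy
# distribution to the current one (array names are illustrative, not the
# placeholders wired into the graph above):
import numpy as np

def distillation_loss(recorded_probs, current_logits):
    shifted = current_logits - current_logits.max(axis=1, keepdims=True)
    log_current = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    # - sum_a p_recorded(a) * log p_current(a), averaged over the batch
    return -np.sum(recorded_probs * log_current, axis=1).mean()

recorded_probs = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]])  # from the replay buffer
current_logits = np.array([[2.0, 0.5, -1.0], [0.0, 1.5, 0.2]])
print(distillation_loss(recorded_probs, current_logits))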
def __init__(self, policy, ob_space, ac_space, nenvs, master_ts = 1, worker_ts = 30, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, cell = 256, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', algo='regular', beta=1e-3): print('Create Session') gpu_options = tf.GPUOptions(allow_growth=True) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) nact = ac_space.n nbatch = nenvs*master_ts*worker_ts A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, 1, cell = cell, model='step_model', algo=algo) train_model = policy(sess, ob_space, ac_space, nbatch, master_ts, worker_ts, model='train_model', algo=algo) print('model_setting_done') #loss construction neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.wpi)) pg_loss = pg_loss - entropy * ent_coef print('algo: ', algo, 'max_grad_norm: ', str(max_grad_norm)) try: if algo == 'regular': loss = pg_loss + vf_coef * vf_loss elif algo == 'VIB': ''' implement VIB here, apart from the vf_loss and pg_loss, there should be a third loss, the kl_loss = ds.kl_divergence(model.encoding, prior), where prior is a Gaussian distribution with mu=0, std=1 the final loss should be pg_loss + vf_coef * vf_loss + beta*kl_loss ''' prior = ds.Normal(0.0, 1.0) kl_loss = tf.reduce_mean(ds.kl_divergence(train_model.encoding, prior)) loss = pg_loss + vf_coef * vf_loss + beta*kl_loss # pass else: raise Exception('Algorithm not exists') except Exception as e: print(e) grads, global_norm = grad_clip(loss, max_grad_norm, ['model']) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(wobs, whs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(whs)): cur_lr = lr.value() td_map = {train_model.wX:wobs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.wS] = states td_map[train_model.wM] = masks ''' you can add and run additional loss for VIB here for debugging, such as kl_loss ''' tloss, value_loss, policy_loss, policy_entropy, _ = sess.run( [loss, vf_loss, pg_loss, entropy, _train], feed_dict=td_map ) return tloss, value_loss, policy_loss, policy_entropy params = find_trainable_variables("model") def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.wvalue self.get_wh = step_model.get_wh self.initial_state = step_model.w_initial_state self.train = train self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
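# A minimal NumPy sketch of the KL term in the VIB branch above: for a diagonal Gaussian
# encoding with mean `mu` and standard deviation `sd`, KL(N(mu, sd^2) || N(0, 1)) has the
# closed form below (ds.kl_divergence computes the same quantity elementwise). Illustration
# only, with made-up numbers.
import numpy as np

def kl_diag_gaussian_vs_standard_normal(mu, sd):
    # elementwise KL(N(mu, sd^2) || N(0, 1)) = 0.5 * (mu^2 + sd^2 - 1) - log(sd)
    return 0.5 * (mu ** 2 + sd ** 2 - 1.0) - np.log(sd)

mu = np.array([[0.0, 0.5], [1.0, -1.0]])
sd = np.array([[1.0, 0.8], [0.5, 1.2]])
print(kl_diag_gaussian_vs_standard_normal(mu, sd).mean())  # analogue of tf.reduce_mean(kl_loss)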
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef, vf_coef, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nbatch = nenvs * nsteps step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) A = train_model.pdtype.sample_placeholder([nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) eps = 1e-6 #nadv = ADV / (train_model.ret_rms.std + eps) #nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps) nadv = (ADV - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps) nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps) nlogpac = -train_model.pd.logp(A) pg_loss = tf.reduce_mean(nadv * nlogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), nr)) #vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vnorm), nr)) entropy = tf.reduce_mean(train_model.pd.entropy()) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) avg_norm_ret = tf.reduce_mean(tf.abs(train_model.ret_rms.mean)) avg_norm_obs = tf.reduce_mean(tf.abs(train_model.ob_rms.mean)) def train(obs, states, returns, masks, actions, values): advs = returns - values #advs = (advs - np.mean(advs)) / (np.std(advs) + eps) for step in range(len(obs)): cur_lr = lr.value() if hasattr(train_model, "ob_rms"): train_model.ob_rms.update( sess, obs) # update running mean/std for observations of policy if hasattr(train_model, "ret_rms"): train_model.ret_rms.update( sess, returns) # # update running mean/std for returns td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks ravg_norm_obs, policy_loss, value_loss, policy_entropy, _ = sess.run( [avg_norm_obs, pg_loss, vf_loss, entropy, _train], td_map) return ravg_norm_obs, policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
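# A minimal sketch of the running mean/std normalisation used above (train_model.ret_rms /
# ob_rms): advantages and returns are shifted and scaled by running statistics that are
# updated from each batch. This is a plain NumPy stand-in, not the RunningMeanStd class the
# model actually uses; the update follows the standard parallel-variance formula.
import numpy as np

class RunningMeanStd:
    def __init__(self, eps=1e-6):
        self.mean, self.var, self.count = 0.0, 1.0, eps

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        batch_mean, batch_var, batch_count = x.mean(), x.var(), x.size
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        self.var = (m_a + m_b + delta ** 2 * self.count * batch_count / total) / total
        self.mean, self.count = new_mean, total

    def normalize(self, x, eps=1e-6):
        return (np.asarray(x) - self.mean) / (np.sqrt(self.var) + eps)

ret_rms = RunningMeanStd()
ret_rms.update([1.0, 2.0, 3.0, 4.0])
print(ret_rms.normalize([2.5]))  # analogue of nr = (R - ret_rms.mean) / (ret_rms.std + eps)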
def learn_hoof_a2c( network, env, seed=None, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100, load_path=None, # Baselines default settings till here optimiser='RMSProp', lr_upper_bound=None, ent_upper_bound=None, num_lr=None, num_ent_coeff=None, max_kl=-1.0, # -1.0 is for no KL constraint **network_kwargs): ''' Main entry point for the HOOF A2C algorithm. Train a policy with a given network architecture on a given environment using A2C, with HOOF-style on-the-fly search over the learning rate and entropy coefficient. Parameters: ----------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) seed: seed to make the random number sequence in the algorithm reproducible. Defaults to None, which means the seed is taken from the system noise generator (not reproducible) nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int, total number of timesteps to train on (default: 80M) max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5) lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) gamma: float, reward discounting parameter (default: 0.99) log_interval: int, specifies how frequently the logs are printed out (default: 100) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) # Get the nb of env nenvs = env.num_envs policy = build_policy(env, network, **network_kwargs) # overwrite default params if using HOOF if lr_upper_bound is not None: lr = 1.0 lrschedule = 'constant' else: num_lr = 1 if ent_upper_bound is None: num_ent_coeff = 1 # Instantiate the model object (that creates step_model and train_model) model = HOOF_Model( policy=policy, env=env, nsteps=nsteps, optimiser=optimiser, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, total_timesteps=total_timesteps, alpha=alpha, epsilon=epsilon # defaults for RMSProp ) runner = HOOF_Runner(env, model, nsteps=nsteps, gamma=gamma) epinfobuf = deque(maxlen=100) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Calculate the batch_size nbatch = nenvs * nsteps # model helper functions model_params = find_trainable_variables("a2c_model") get_flat = U.GetFlat(model_params) set_from_flat = U.SetFromFlat(model_params) # for Gaussian policies def kl(new_mean, new_sd, old_mean, old_sd): approx_kl = np.log(new_sd / old_sd) + ( old_sd**2 + (old_mean - new_mean)**2) / (2.0 * new_sd**2 + 10**-8) - 0.5 approx_kl = np.sum(approx_kl, axis=1) approx_kl = np.mean(approx_kl) return approx_kl if max_kl == -1.0: # set max kl to a high val in case there is no constraint max_kl = 10**8 # Start total timer tstart = time.time() for update in range(1, int(total_timesteps // nbatch + 1)): obs, states, rewards, masks, actions, values, undisc_rwds, epinfos = runner.run( ) epinfobuf.extend(epinfos) old_mean, old_sd, old_neg_ll = model.get_mean_std_neg_ll(obs, actions) for step in range(len(obs)): cur_lr = lr.value() opt_pol_val = -10**8 old_params = get_flat() rms_weights_before_upd = model.get_opt_state() approx_kl = np.zeros((num_ent_coeff, num_lr)) epv = np.zeros((num_ent_coeff, num_lr)) rand_lr = lr_upper_bound * np.random.rand( num_lr) if lr_upper_bound is not None else [cur_lr] rand_lr = np.sort(rand_lr) rand_ent_coeff = ent_upper_bound * np.random.rand( num_ent_coeff) if ent_upper_bound is not None else [ent_coef] for nec in range(num_ent_coeff): # reset policy and optimiser set_from_flat(old_params) model.set_opt_state(rms_weights_before_upd) # get grads for loss fn with given entropy coeff policy_loss, value_loss, policy_entropy = model.train( obs, states, rewards, masks, actions, values, rand_ent_coeff[nec]) new_params = get_flat() ent_grads = new_params - old_params # enumerate over different LR for nlr in range(num_lr): new_params = old_params + rand_lr[nlr] * ent_grads set_from_flat(new_params) new_mean, new_sd, new_neg_ll = model.get_mean_std_neg_ll( obs, actions) lik_ratio = np.exp(-new_neg_ll + old_neg_ll) est_pol_val = wis_estimate(nenvs, nsteps, undisc_rwds, lik_ratio) approx_kl[nec, nlr] = kl(new_mean, new_sd, old_mean, old_sd) epv[nec, nlr] = est_pol_val if (nec == 0 and nlr == 0) or (est_pol_val > opt_pol_val and approx_kl[nec, nlr] < max_kl): opt_pol_val = est_pol_val opt_pol_params = get_flat() opt_rms_wts = model.get_opt_state() opt_lr = rand_lr[nlr] opt_ent_coeff = rand_ent_coeff[nec] opt_kl = approx_kl[nec, nlr] # update policy and rms prop to optimal wts set_from_flat(opt_pol_params) model.set_opt_state(opt_rms_wts) # Shrink LR search space if too many get rejected if lr_upper_bound is not None: rejections = np.sum(approx_kl > max_kl) / num_lr if rejections > 0.8: lr_upper_bound *= 0.8 if rejections == 0: lr_upper_bound *= 1.25 nseconds = time.time() - tstart # Calculate the fps (frame per second) fps = int((update * nbatch) / nseconds) if update % 
log_interval == 0 or update == 1: # Calculates whether the value function is a good predictor of the returns (ev close to 1) # or if it's just worse than predicting nothing (ev <= 0) ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.record_tabular("opt_lr", float(opt_lr)) logger.record_tabular("opt_ent_coeff", float(opt_ent_coeff)) logger.record_tabular("approx_kl", float(opt_kl)) if lr_upper_bound is not None: logger.record_tabular("rejections", rejections) logger.record_tabular("lr_ub", lr_upper_bound) logger.record_tabular( "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular( "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() return model
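# A hedged sketch of how HOOF scores each candidate (learning rate, entropy coefficient)
# without extra environment interaction: a weighted importance sampling (WIS) estimate of
# the candidate policy's value on the just-collected batch. This is one plausible reading
# of the wis_estimate call above, assuming per-trajectory products of the per-step
# likelihood ratios; the actual helper in this code base may differ in detail.
import numpy as np

def wis_estimate(nenvs, nsteps, undisc_rewards, lik_ratio):
    rewards = np.asarray(undisc_rewards).reshape(nenvs, nsteps)
    ratios = np.asarray(lik_ratio).reshape(nenvs, nsteps)
    traj_weights = np.prod(ratios, axis=1)      # importance weight of each trajectory segment
    traj_returns = np.sum(rewards, axis=1)      # undiscounted return of each segment
    return np.sum(traj_weights * traj_returns) / (np.sum(traj_weights) + 1e-8)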
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps writter = tf.summary.FileWriter( "/tmp/a2c_demo/1") # Change for SAT: this is to use tensorBoard A = tf.placeholder( tf.int32, [nbatch]) # Comments by Fei: this must be the action ADV = tf.placeholder( tf.float32, [nbatch]) # Comments by Fei: this must be the advantage R = tf.placeholder( tf.float32, [nbatch]) # Comments by Fei: this must be the reward LR = tf.placeholder( tf.float32, []) # Comments by Fei: this must be the learning rate step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) # Comments by Fei: pi is nbatch * nact pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) # writter.add_graph(sess.graph) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
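# A minimal sketch of the learning-rate Scheduler used throughout these models: with
# lrschedule='linear' the rate decays linearly from its initial value to zero over
# total_timesteps, advancing by one step per value() call. Illustration only, not the
# baselines Scheduler class itself.
class LinearScheduler:
    def __init__(self, v, nvalues):
        self.v, self.nvalues, self.n = v, float(nvalues), 0.0

    def value(self):
        current = self.v * max(1.0 - self.n / self.nvalues, 0.0)
        self.n += 1.0
        return current

lr_sched = LinearScheduler(v=7e-4, nvalues=1000)
print([round(lr_sched.value(), 7) for _ in range(3)])  # slowly decaying learning rates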
def __init__(self, policy, p, has_state): """ policy : Internal Policy model such as SnakeModel.CNNPolicy p : Hyperparameters required for training """ sess = tf_util.make_session() # Tensorflow model initialization step_model = policy(sess=sess, p=p, train_phase=False, has_state=has_state) # Deploy model settings train_model = policy(sess=sess, p=p, train_phase=True, has_state=has_state) # Training model settings saver = tf.train.Saver() #Step 2 : Initialize the training parameters A = tf.placeholder(tf.int32, [p.N_BATCH]) ADV = tf.placeholder(tf.float32, [p.N_BATCH]) R = tf.placeholder(tf.float32, [p.N_BATCH]) LR = tf.placeholder(tf.float32, []) #Step 3 : Define the loss Function neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) # pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * p.ENTROPY_COEFF + vf_loss * p.VALUE_FUNC_COEFF #Step 4 : Define the loss optimizer params = find_trainable_variables("model") grads = tf.gradients(loss, params) if p.MAX_GRAD_NORM is not None: grads, grad_norm = tf.clip_by_global_norm( grads, p.MAX_GRAD_NORM ) # Clipping the gradients to protect learned weights grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=p.RMS_DECAY, epsilon=p.EPSILON) _train = trainer.apply_gradients( grads) # This is the op which will be run to apply the update lr = Scheduler(v=p.LEARNING_RATE, nvalues=p.N_TIMESTEPS, schedule=p.LEARNING_RATE_SCHEDULE ) # Learning rate changes linearly or as per arguments # Step 5 : Write down the summary parameters to be used writer = tf.summary.FileWriter(p.LOG_PATH) #summary writer def train(obs, rewards, masks, actions, values, states): """ obs : batch x n x m x 1 snake matrix rewards : batch x 1 rewards corresponding to each action actions : batch x 1 discrete action taken values : batch x 1 output of value function during the training process """ advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, train_model.S: states, A: actions, ADV: advs, R: rewards, LR: cur_lr } policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): #ps = sess.run(params) #make_path(save_path) #joblib.dump(ps, save_path) saver.save(sess, save_path) def load(load_path): #loaded_params = joblib.load(load_path) #restores = [] #for p, loaded_p in zip(params, loaded_params): # restores.append(p.assign(loaded_p)) #ps = sess.run(restores) saver.restore(sess, load_path) def add_scalar_summary(tag, value, step): summary = tf.Summary( value=[tf.Summary.Value(tag=tag, simple_value=value)]) writer.add_summary(summary, step) # Expose the closure functions to the user self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.hidden_value = step_model.hidden_value self.initial_state = step_model.initial_state self.add_scalar_summary = add_scalar_summary self.save = save self.load = load # Initialize global variables and add tf graph tf.global_variables_initializer().run(session=sess) writer.add_graph(tf.get_default_graph()) #write graph
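# A minimal NumPy analogue of the global-norm gradient clipping applied in every model in
# this file (tf.clip_by_global_norm): gradients are rescaled jointly so that their combined
# L2 norm does not exceed max_norm, which protects the learned weights from large updates.
import numpy as np

def clip_by_global_norm(grads, max_norm, eps=1e-8):
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    scale = min(1.0, max_norm / (global_norm + eps))   # only scales down, never up
    return [g * scale for g in grads], global_norm

grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]
clipped, norm = clip_by_global_norm(grads, max_norm=0.5)
print(norm, clipped)   # norm == 13.0, clipped gradients keep the same direction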
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nscripts=16, nsteps=20, nstack=4, ent_coef=0.1, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.001, kfac_clip=0.001, lrschedule='linear', alpha=0.99, epsilon=1e-5): config = tf.ConfigProto( allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nsml.bind(sess=sess) #nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) XY0 = tf.placeholder(tf.int32, [nbatch]) XY1 = tf.placeholder(tf.int32, [nbatch]) # ADV == TD_TARGET - values ADV = tf.placeholder(tf.float32, [nbatch]) TD_TARGET = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy( sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) self.model2 = train_model = policy( sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) # Policy 1 : Base Action : train_model.pi label = A script_mask = tf.concat( [ tf.zeros([nscripts * nsteps, 1]), tf.ones([(nprocs - nscripts) * nsteps, 1]) ], axis=0) pi = train_model.pi pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi, labels=A) neglogpac *= tf.stop_gradient(pac_weight) inv_A = 1.0 - tf.cast(A, tf.float32) xy0_mask = tf.cast(A, tf.float32) xy1_mask = tf.cast(A, tf.float32) condition0 = tf.equal(xy0_mask, 2) xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) xy0_mask = 1.0 - xy0_mask condition1 = tf.equal(xy1_mask, 2) xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) # One hot representation of chosen marine. # [batch_size, 2] pi_xy0 = train_model.pi_xy0 pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 pac_weight = tf.reduce_sum( pac_weight * tf.one_hot(XY0, depth=1024), axis=1) logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi_xy0, labels=XY0) logpac_xy0 *= tf.stop_gradient(pac_weight) logpac_xy0 *= tf.cast(xy0_mask, tf.float32) pi_xy1 = train_model.pi_xy1 pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 pac_weight = tf.reduce_sum( pac_weight * tf.one_hot(XY0, depth=1024), axis=1) # 1D? 2D? 
logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi_xy1, labels=XY1) logpac_xy1 *= tf.stop_gradient(pac_weight) logpac_xy1 *= tf.cast(xy1_mask, tf.float32) pg_loss = tf.reduce_mean(ADV * neglogpac) pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) vf_ = tf.squeeze(train_model.vf) vf_r = tf.concat( [ tf.ones([nscripts * nsteps, 1]), tf.zeros([(nprocs - nscripts) * nsteps, 1]) ], axis=0) * TD_TARGET vf_masked = vf_ * script_mask + vf_r #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps] vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) entropy = entropy_a + entropy_xy0 + entropy_xy1 loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) self.logits = logits = train_model.pi # xy0 self.params_common = params_common = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') self.params_xy0 = params_xy0 = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss self.grads_check_xy0 = grads_xy0 = tf.gradients( train_loss_xy0, params_xy0) if max_grad_norm is not None: grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) grads_xy0 = list(zip(grads_xy0, params_xy0)) trainer_xy0 = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) # xy1 self.params_xy1 = params_xy1 = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss self.grads_check_xy1 = grads_xy1 = tf.gradients( train_loss_xy1, params_xy1) if max_grad_norm is not None: grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) grads_xy1 = list(zip(grads_xy1, params_xy1)) trainer_xy1 = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, td_targets, masks, actions, xy0, xy1, values): advs = td_targets - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = { train_model.X: obs, A: actions, XY0: xy0, XY1: xy1, ADV: advs, TD_TARGET: td_targets, PG_LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _, \ policy_loss_xy0, policy_entropy_xy0, _, \ policy_loss_xy1, policy_entropy_xy1, _ = sess.run( [pg_loss, vf_loss, entropy, _train, pg_loss_xy0, entropy_xy0, _train_xy0, pg_loss_xy1, entropy_xy1, _train_xy1], td_map) return policy_loss, value_loss, policy_entropy, \ policy_loss_xy0, policy_entropy_xy0, \ policy_loss_xy1, policy_entropy_xy1 def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save 
self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state print("global_variables_initializer start") tf.global_variables_initializer().run(session=sess) print("global_variables_initializer complete")
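# A small NumPy illustration of the pac_weight construction above: for the first nscripts
# environments (script_mask = 0) every action gets weight 1, while for the remaining
# environments (script_mask = 1) the weight is the policy's own probability of the taken
# action; the weight multiplies the cross-entropy term under a stop-gradient. Illustration
# only, with two rows standing in for one scripted and one non-scripted sample.
import numpy as np

def _softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

script_mask = np.array([[0.0], [1.0]])
pi = np.array([[2.0, 0.1, 0.1], [0.1, 2.0, 0.1]])
actions = np.array([0, 1])
pac_weight = script_mask * (_softmax(pi) - 1.0) + 1.0
pac_weight = np.sum(pac_weight * np.eye(3)[actions], axis=1)
print(pac_weight)   # ~[1.0, pi(a|s)] for the scripted and non-scripted rows respectively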
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, lambda_dist=0.01, total_timesteps=None, lrschedule='linear'): sess = tf.get_default_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) config = Config() act_model = policy(config) config.reuse = True train_model = policy(config) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.logits, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.logits)) aux_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.rp_logits, labels=A) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + aux_loss * lambda_dist params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) saver = tf.train.Saver() def train(obs, rs, rr, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr, train_model.inputs_s: rs, train_model.inputs_r: rr } policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): saver.save(sess, save_path + 'model.ckpt') def load(load_path): saver.restore(sess, load_path + 'model.ckpt') self.train = train self.train_model = train_model self.act_model = act_model self.act = act_model.act self.value = act_model.value self.save = save self.load = load
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) # actions D = tf.placeholder(tf.float32, [nbatch]) # dones R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) params = find_trainable_variables("model") print("Params {}".format(len(params))) for var in params: print(var) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) print(v.name) return v with tf.variable_scope("", custom_getter=custom_getter, reuse=True): polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i v = tf.reduce_sum(train_model.pi * train_model.q, axis=-1) # shape is [nenvs * (nsteps + 1)] # strip off last step f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, A) q_i = get_by_index(q, A) # Compute ratios for importance truncation rho = f / (MU + eps) rho_i = get_by_index(rho, A) # Calculate Q_retrace targets qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling adv = qret - v logf = tf.log(f_i + eps) gain_f = logf * tf.stop_gradient( adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]) ) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2) gain_bc = tf.reduce_sum( logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1) #IMP: This is sum, as expectation wrt f loss_bc = -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]] * 2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = -f_pol / ( f + eps ) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj = 
tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g / ( nenvs * nsteps ) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) grads = [ gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params) ] avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging run_ops = [ _train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads ] names_ops = [ 'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if trust_region: run_ops = run_ops + [ norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] names_ops = names_ops + [ 'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] def train(obs, actions, rewards, dones, mus, states, masks, steps): cur_lr = lr.value_steps(steps) td_map = { train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks td_map[polyak_model.S] = states td_map[polyak_model.M] = masks return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) self.train = train self.save = save self.train_model = train_model self.step_model = step_model self.step = step_model.step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
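# A hedged NumPy sketch (single environment) of the Retrace target computed by q_retrace
# above: work backwards through the rollout, bootstrapping from the last state value and
# correcting towards the critic with truncated importance ratios. Here `v` is assumed to
# hold nsteps + 1 state values (the extra entry is the bootstrap value), matching the
# nsteps + 1 steps the train_model is built with.
import numpy as np

def q_retrace_single_env(rewards, dones, q_i, v, rho_i, gamma):
    nsteps = len(rewards)
    rho_bar = np.minimum(1.0, np.asarray(rho_i, dtype=np.float64))
    qret = v[-1]                                  # bootstrap from the final state value
    targets = np.zeros(nsteps)
    for t in reversed(range(nsteps)):
        qret = rewards[t] + gamma * qret * (1.0 - dones[t])
        targets[t] = qret
        # shrink towards the critic before stepping one transition further back
        qret = rho_bar[t] * (qret - q_i[t]) + v[t]
    return targets

print(q_retrace_single_env([1.0, 0.0], [0.0, 0.0], [0.5, 0.4], [0.6, 0.5, 0.3], [1.2, 0.8], 0.99))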
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', network='cnn', prio_args=None): self.prio_args = prio_args sess = tf_util.get_session() nenvs = self.get_active_envs(env) nbatch = nenvs * nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) # our TD evaluating network A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) # TD loss # td_loss = losses.mean_squared_error(tf.squeeze(train_model.dt), TD) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef """prio model""" with tf.variable_scope('a2c_model_prio', reuse=tf.AUTO_REUSE): # prio_model = policy(nbatch, nsteps, sess) prio_model = MyNN(env, nbatch, network) P_R = tf.placeholder(tf.float32, [nbatch]) PRIO = tf.placeholder(tf.float32, [nbatch]) P_LR = tf.placeholder(tf.float32, []) # prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out), P_R) # Reward prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out), PRIO) # TD Error # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") params_prio = find_trainable_variables("a2c_model_prio") # 2. Calculate the gradients grads = tf.gradients(loss, params) prio_grads = tf.gradients(prio_model_loss, params_prio) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) prio_grads, prio_grad_norm = tf.clip_by_global_norm( prio_grads, max_grad_norm) grads = list(zip(grads, params)) prio_grads = list(zip(prio_grads, params_prio)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. 
Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) prio_trainer = tf.train.RMSPropOptimizer(learning_rate=P_LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) _prio_train = prio_trainer.apply_gradients(prio_grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) prio_loss = 0 if self.prio_args is not None: prio_values = GetValuesForPrio(self.prio_args['prio_type'], self.prio_args['prio_param'], advs, rewards) prio_td_map = { prio_model.X: obs, P_R: rewards, P_LR: cur_lr, PRIO: prio_values } prio_loss, _, p_td = sess.run( [prio_model_loss, _prio_train, PRIO], prio_td_map) # mb aranged as 1D-vector = [[env_1: n1, ..., n_nstep],...,[env_n_active]] # need to take last value of each env's buffer self.prio_score = prio_values[list( filter(lambda x: x % nsteps == (nsteps - 1), range(len(prio_values))))] return policy_loss, value_loss, policy_entropy, prio_loss self.train = train self.train_model = train_model self.step_model = step_model self.prio_model = prio_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
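# A clearer NumPy slice equivalent to the index filter used for self.prio_score above:
# the flat [nenvs * nsteps] priority buffer is laid out env-by-env (nsteps entries per
# environment), and only the last timestep of each environment's segment is kept.
# Illustration only, with stand-in numbers.
import numpy as np

nenvs_demo, nsteps_demo = 3, 4
prio_values_demo = np.arange(nenvs_demo * nsteps_demo, dtype=float)
last_per_env = prio_values_demo.reshape(nenvs_demo, nsteps_demo)[:, -1]
print(last_per_env)   # indices 3, 7, 11 -- the same elements the modulo filter selects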
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', param=None): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False, param=param) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True, param=param) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta, icm ): sess = get_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) # actions D = tf.placeholder(tf.float32, [nbatch]) # dones R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape) train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape) with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE): step_model = policy(nbatch=nenvs , nsteps=1,observ_placeholder=step_ob_placeholder, sess=sess) train_model = policy(nbatch=nbatch , nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess) params = find_trainable_variables("acer_model") print("Params {}".format(len(params))) # for var in params: # print(var) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) print(v.name) return v with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True): polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i # shape is [n_envs * (n_steps + 1)] # action probability distributions according to train_model, polyak_model and step_model # poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax train_model_p = tf.nn.softmax(train_model.pi) polyak_model_p = tf.nn.softmax(polyak_model.pi) step_model_p = tf.nn.softmax(step_model.pi) # train model policy probility and train model q value v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)] # strip off last step # dictribution_f , f_polyak, q_value f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, A) q_i = get_by_index(q, A) # Compute ratios for importance truncation rho = f / (MU + eps) rho_i = get_by_index(rho, A) # Calculate Q_retrace targets # passed # R = reward , D = done_ph , v = value ,... 
rest is same qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) # f is distribution here # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) # v is value here check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling if icm is None : adv = qret - v # v is value here else : # print("Adv Normalization") # > Advantage Normalization adv = qret - v # m , s = get_mean_and_std(icm_adv) # advs = (icm_adv - m) / (s + 1e-7) # > Advantage Normalization logf = tf.log(f_i + eps) # c is correction term # importance weight clipping factor : c gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2) gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f loss_bc= -tf.reduce_mean(gain_bc) # IMP: This is sum, as expectation wrt f loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]]*2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. 
avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g/(nenvs*nsteps) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)] avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) if icm is not None : # print("with ICM") grads = grads + icm.pred_grads_and_vars trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging if icm is not None : # print("With ICM") run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads , icm.forw_loss , icm.inv_loss, icm.icm_loss] names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ,'icm.forw_loss' , 'icm.inv_loss', 'icm.icm_loss' ] if trust_region: run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] else : run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads ] names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if trust_region: run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] def train(obs, actions, rewards, dones, mus, states, masks, steps, next_states, icm_actions ): cur_lr = lr.value_steps(steps) if icm is not None : print("with ICM ") td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr , icm.state_:obs, icm.next_state_ : next_states , icm.action_ : icm_actions} else : td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks td_map[polyak_model.S] = states td_map[polyak_model.M] = masks return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train def _step(observation, **kwargs): return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs) self.train = train self.save = functools.partial(save_variables, sess=sess, variables=params) self.train_model = train_model self.step_model = step_model self._step = _step self.step = self.step_model.step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
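# A per-sample NumPy sketch of the trust-region adjustment used in both ACER variants
# above: with g the policy-statistics gradient and k the gradient of KL(f_polyak || f),
# the component of g along k is reduced just enough that the adjusted step keeps the KL
# increase below delta. Illustration only, with made-up vectors.
import numpy as np

def trust_region_adjust(g, k, delta, eps=1e-6):
    k_dot_g = np.sum(k * g, axis=-1)
    adj = np.maximum(0.0, (k_dot_g - delta) / (np.sum(np.square(k), axis=-1) + eps))
    return g - adj[..., None] * k

g = np.array([[1.0, -0.5, 0.2]])
k = np.array([[0.3, 0.1, -0.2]])
print(trust_region_adjust(g, k, delta=0.05))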
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs*nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
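# A minimal sketch of where the `rewards` fed to train() come from: the runner produces
# n-step discounted returns bootstrapped with V(s') at the rollout boundary, so that
# advs = rewards - values corresponds to A(s,a) = R + gamma*V(s') - V(s). This single-
# environment NumPy version is an illustration, not the original Runner code.
import numpy as np

def discount_with_bootstrap(rewards, dones, last_value, gamma):
    returns = np.zeros(len(rewards))
    running = last_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns

print(discount_with_bootstrap([1.0, 0.0, 1.0], [0.0, 0.0, 0.0], last_value=0.5, gamma=0.99))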
def __init__(self, policy, ob_space, ac_space, n_envs, total_timesteps, nprocs=32, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear'): """ The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144 :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) :param ob_space: (Gym Space) The observation space :param ac_space: (Gym Space) The action space :param n_envs: (int) The number of environments :param total_timesteps: (int) The total number of timesteps for training the model :param nprocs: (int) The number of threads for TensorFlow operations :param n_steps: (int) The number of steps to run for each environment :param ent_coef: (float) The weight for the entropic loss :param vf_coef: (float) The weight for the loss on the value function :param vf_fisher_coef: (float) The weight for the fisher loss on the value function :param learning_rate: (float) The initial learning rate for the RMS prop optimizer :param max_grad_norm: (float) The clipping value for the maximum gradient :param kfac_clip: (float) gradient clipping for Kullback leiber :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop') """ config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) n_batch = n_envs * n_steps action_ph = tf.placeholder(tf.int32, [n_batch]) advs_ph = tf.placeholder(tf.float32, [n_batch]) rewards_ph = tf.placeholder(tf.float32, [n_batch]) pg_lr_ph = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, n_envs, 1, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, n_envs * n_steps, n_steps, reuse=True) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.policy, labels=action_ph) self.logits = train_model.policy # training loss pg_loss = tf.reduce_mean(advs_ph * logpac) entropy = tf.reduce_mean(calc_entropy(train_model.policy)) pg_loss = pg_loss - ent_coef * entropy vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph) train_loss = pg_loss + vf_coef * vf_loss # Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.value_fn + tf.random_normal( tf.shape(train_model.value_fn)) self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean( tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2)) self.joint_fisher = pg_fisher_loss + vf_fisher_loss self.params = params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer( learning_rate=pg_lr_ph, clip_kl=kfac_clip, momentum=0.9, kfac_update=1, epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) optim.compute_and_apply_stats(self.joint_fisher, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner self.learning_rate = Scheduler(initial_value=learning_rate, n_values=total_timesteps, schedule=lr_schedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for _ in range(len(obs)): cur_lr = self.learning_rate.value() td_map = { train_model.obs_ph: obs, action_ph: actions, advs_ph: advs, 
rewards_ph: rewards, pg_lr_ph: cur_lr } if states is not None: td_map[train_model.states_ph] = states td_map[train_model.masks_ph] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): session_params = sess.run(params) joblib.dump(session_params, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for param, loaded_p in zip(params, loaded_params): restores.append(param.assign(loaded_p)) sess.run(restores) self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(20e6), lrschedule='linear'): sess = tf.get_default_session() nbatch = nenvs * nsteps step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True) A = train_model.pdtype.sample_placeholder([nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(train_model.pd.entropy()) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, discounted_rewards, rewards, masks, prev_actions, actions, values, dones): advs = discounted_rewards - values for step in range(len(obs)): cur_lr = lr.value() # reshape actions, rewards, and dones to have first dimension of size nenvs*nsteps, existing second dimension # this is already done for obs rews = np.reshape(rewards, (nbatch, 1)) ds = np.reshape(np.asarray(dones, dtype=np.float32), (nbatch, 1)) if len(ac_space.shape) == 0: prev_actions = np.reshape(prev_actions, (nbatch, )) one_hot = np.eye(ac_space.n)[prev_actions] for i in range(nbatch): if prev_actions[i] == -1: one_hot[i, :] = np.zeros((ac_space.n, ), dtype=np.int) x = np.concatenate((obs, one_hot, rews, ds), axis=1) actions = np.reshape(actions, (nbatch, )) else: prev_actions = np.reshape(prev_actions, (nbatch, ac_space.shape[0])) x = np.concatenate((obs, prev_actions, rews, ds), axis=1) td_map = { train_model.X: x, A: actions, ADV: advs, R: discounted_rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
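# A compact NumPy version of the input construction inside train() above for the discrete-
# action case: the previous action is one-hot encoded (an all-zero row for the -1 "no
# previous action" marker) and concatenated with the observation, reward, and done flag.
# Illustration only; shapes follow the reshapes used in the original code.
import numpy as np

def augment_obs(obs, prev_actions, rewards, dones, n_actions):
    nbatch = obs.shape[0]
    one_hot = np.eye(n_actions)[np.maximum(prev_actions, 0)]
    one_hot[prev_actions == -1] = 0.0               # no previous action at episode start
    rews = np.reshape(rewards, (nbatch, 1))
    ds = np.reshape(np.asarray(dones, dtype=np.float32), (nbatch, 1))
    return np.concatenate([obs, one_hot, rews, ds], axis=1)

x = augment_obs(np.zeros((2, 4)), np.array([-1, 2]), np.array([0.0, 1.0]), np.array([0.0, 0.0]), n_actions=3)
print(x.shape)   # (2, 4 + 3 + 1 + 1)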