def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32,
             nsteps=20, nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0,
             lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=nprocs,
                            inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
    self.logits = logits = train_model.pi

    ## training loss
    pg_loss = tf.reduce_mean(ADV * logpac)
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    pg_loss = pg_loss - ent_coef * entropy
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    train_loss = pg_loss + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
    sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
    self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
        tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
    self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

    self.params = params = find_trainable_variables("model")
    self.grads_check = grads = tf.gradients(train_loss, params)

    with tf.device('/gpu:0'):
        # NOTE: `async` became a reserved word in Python 3.7, so this keyword
        # argument only parses on the older Python/kfac versions this code targets.
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10,
            max_grad_norm=max_grad_norm)

        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
        train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))
    self.q_runner = q_runner
    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = self.lr.value()

        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, PG_LR: cur_lr}
        if states != []:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks

        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, train_op], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        ps = sess.run(params)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
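# --- Hedged usage sketch (not part of the original file) ---------------------
# A minimal single-update driver for the Model above, assuming `make_model`
# builds it and `runner.run()` yields one rollout batch in the order train()
# expects; both names are hypothetical. It also assumes `tf` (tensorflow) is
# imported at module level, as the code above does. tf.train.Coordinator and
# QueueRunner.create_threads are the standard TF1 APIs used to feed the
# asynchronous K-FAC stats queue stored in model.q_runner.
def _example_single_update(make_model, runner):
    model = make_model()
    coord = tf.train.Coordinator()
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    obs, states, rewards, masks, actions, values = runner.run()
    policy_loss, value_loss, policy_entropy = model.train(
        obs, states, rewards, masks, actions, values)
    coord.request_stop()
    coord.join(enqueue_threads)
    return policy_loss, value_loss, policy_entropy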
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32,
             nscripts=16, nsteps=20, nstack=4, ent_coef=0.1, vf_coef=0.5,
             vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.001, kfac_clip=0.001,
             lrschedule='linear', alpha=0.99, epsilon=1e-5):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=nprocs,
                            inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nsml.bind(sess=sess)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])
    XY0 = tf.placeholder(tf.int32, [nbatch])
    XY1 = tf.placeholder(tf.int32, [nbatch])
    # ADV == TD_TARGET - values
    ADV = tf.placeholder(tf.float32, [nbatch])
    TD_TARGET = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A
    script_mask = tf.concat(
        [tf.zeros([nscripts * nsteps, 1]),
         tf.ones([(nprocs - nscripts) * nsteps, 1])],
        axis=0)

    pi = train_model.pi
    pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)

    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=A)
    neglogpac *= tf.stop_gradient(pac_weight)

    inv_A = 1.0 - tf.cast(A, tf.float32)

    xy0_mask = tf.cast(A, tf.float32)
    xy1_mask = tf.cast(A, tf.float32)

    condition0 = tf.equal(xy0_mask, 2)
    xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
    xy0_mask = 1.0 - xy0_mask

    condition1 = tf.equal(xy1_mask, 2)
    xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

    # One hot representation of chosen marine.
    # [batch_size, 2]
    pi_xy0 = train_model.pi_xy0
    pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy0, labels=XY0)
    logpac_xy0 *= tf.stop_gradient(pac_weight)
    logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

    pi_xy1 = train_model.pi_xy1
    pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024), axis=1)  # 1D? 2D?

    logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi_xy1, labels=XY1)
    logpac_xy1 *= tf.stop_gradient(pac_weight)
    logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

    pg_loss = tf.reduce_mean(ADV * neglogpac)
    pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
    pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

    vf_ = tf.squeeze(train_model.vf)

    vf_r = tf.concat(
        [tf.ones([nscripts * nsteps, 1]),
         tf.zeros([(nprocs - nscripts) * nsteps, 1])],
        axis=0) * TD_TARGET
    vf_masked = vf_ * script_mask + vf_r

    #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

    vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
    entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
    entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
    entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
    entropy = entropy_a + entropy_xy0 + entropy_xy1

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    self.logits = logits = train_model.pi

    # xy0
    self.params_common = params_common = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
    self.params_xy0 = params_xy0 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common

    train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy0 = grads_xy0 = tf.gradients(train_loss_xy0, params_xy0)
    if max_grad_norm is not None:
        grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

    grads_xy0 = list(zip(grads_xy0, params_xy0))
    trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

    # xy1
    self.params_xy1 = params_xy1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common

    train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy1 = grads_xy1 = tf.gradients(train_loss_xy1, params_xy1)
    if max_grad_norm is not None:
        grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

    grads_xy1 = list(zip(grads_xy1, params_xy1))
    trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
        advs = td_targets - values
        for step in range(len(obs)):
            cur_lr = self.lr.value()

        td_map = {train_model.X: obs, A: actions, XY0: xy0, XY1: xy1,
                  ADV: advs, TD_TARGET: td_targets, PG_LR: cur_lr}
        if states != []:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks

        policy_loss, value_loss, policy_entropy, _, \
        policy_loss_xy0, policy_entropy_xy0, _, \
        policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train,
             pg_loss_xy0, entropy_xy0, _train_xy0,
             pg_loss_xy1, entropy_xy1, _train_xy1],
            td_map)
        return policy_loss, value_loss, policy_entropy, \
               policy_loss_xy0, policy_entropy_xy0, \
               policy_loss_xy1, policy_entropy_xy1

    def save(save_path):
        ps = sess.run(params)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")
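# --- Hedged illustration (not part of the original file) ---------------------
# NumPy sketch of the pac_weight construction above: rows produced by the
# scripted workers (mask == 0) keep weight 1.0, while rows produced by the
# learned policy (mask == 1) are weighted by the softmax probability of the
# chosen action. Shapes and the depth=3 action space mirror the code above;
# numpy is assumed to be imported as np, as elsewhere in this repository.
def _example_pac_weight(logits, actions, nscripts, nsteps):
    # logits: [nbatch, 3] float array, actions: [nbatch] int array
    nbatch = logits.shape[0]
    mask = np.concatenate([np.zeros([nscripts * nsteps, 1]),
                           np.ones([nbatch - nscripts * nsteps, 1])], axis=0)
    probs = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs /= probs.sum(axis=1, keepdims=True)
    weight = mask * (probs - 1.0) + 1.0          # scripted rows -> 1.0, learned rows -> probs
    one_hot = np.eye(3)[actions]                 # select the probability of the taken action
    return (weight * one_hot).sum(axis=1)        # [nbatch]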
def __init__(self, policy, ob_space, ac_space, nenvs, expert_nbatch, total_timesteps,
             nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0,
             vf_expert_coef=0.5 * 0.0, expert_coeff=1.0, exp_adv_est='reward',
             lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'):
    # create tf stuff
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=nprocs,
                            inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)

    # the actual model
    nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])
    A_EXP = tf.placeholder(tf.int32, [expert_nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    ADV_EXP = tf.placeholder(tf.float32, [expert_nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    R_EXP = tf.placeholder(tf.float32, [expert_nbatch])
    PG_LR = tf.placeholder(tf.float32, [])

    step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
    eval_step_model = policy(sess, ob_space, ac_space, 1, 1, reuse=True)
    train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
    expert_train_model = policy(sess, ob_space, ac_space, expert_nbatch, 1, reuse=True)

    logpac_expert = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=expert_train_model.pi, labels=A_EXP)
    logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=train_model.pi, labels=A)
    _, acc = tf.metrics.accuracy(labels=A, predictions=tf.argmax(train_model.pi, 1))

    ## training loss
    pg_loss = tf.reduce_mean(ADV * logpac)
    pg_expert_loss = tf.reduce_mean(ADV_EXP * logpac_expert)
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    pg_loss = pg_loss - ent_coef * entropy
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    vf_expert_loss = tf.reduce_mean(mse(tf.squeeze(expert_train_model.vf), R_EXP))
    train_loss = (pg_loss + vf_coef * vf_loss
                  + expert_coeff * pg_expert_loss + vf_expert_coef * vf_expert_loss)

    self.check = check = tf.add_check_numerics_ops()

    ## Fisher loss construction
    pg_fisher_loss = -tf.reduce_mean(logpac)  # + logpac_expert)
    # pg_expert_fisher_loss = -tf.reduce_mean(logpac_expert)
    sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
    vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
        tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
    joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

    params = find_trainable_variables("model")

    self.grads_check = grads = tf.gradients(train_loss, params)

    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR,
            clip_kl=kfac_clip,
            momentum=0.9,
            kfac_update=1,
            epsilon=0.01,
            stats_decay=0.99,
            async=1,
            cold_iter=20,
            max_grad_norm=max_grad_norm
        )

        # why is this unused?
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
        train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))
    self.q_runner = q_runner
    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values,
              expert_obs, expert_rewards, expert_actions, expert_values):
        if exp_adv_est == 'critic':
            expert_advs = np.clip(expert_rewards - expert_values, a_min=0, a_max=None)
        elif exp_adv_est == 'reward':
            expert_advs = expert_rewards
        elif exp_adv_est == 'simple':
            expert_advs = np.ones_like(expert_rewards)
        else:
            raise ValueError("Unknown expert advantage estimator {}".format(exp_adv_est))

        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()

        td_map = {
            train_model.X: obs,
            expert_train_model.X: expert_obs,
            A_EXP: expert_actions,
            A: actions,
            ADV: advs,
            ADV_EXP: expert_advs,
            R: rewards,
            PG_LR: cur_lr,
            R_EXP: expert_rewards
        }
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks

        policy_loss, policy_expert_loss, value_loss, policy_entropy, train_accuracy, _, grads_to_check = sess.run(
            [pg_loss, pg_expert_loss, vf_loss, entropy, acc, train_op, grads],
            td_map
        )
        for grad in grads_to_check:
            if np.isnan(grad).any():
                print("ojojoj grad is nan")

        return policy_loss, policy_expert_loss, value_loss, policy_entropy, train_accuracy

    def save(save_path):
        print("Writing model to {}".format(save_path))
        ps = sess.run(params)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    def eval_step(obs, eval_type):
        td_map = {eval_step_model.X: [obs]}
        logits = sess.run(eval_step_model.pi, td_map)[0]
        if eval_type == 'argmax':
            act = logits.argmax()
            if np.random.rand() < 0.01:
                act = ac_space.sample()
            return act
        elif eval_type == 'prob':
            # probs = func(s[None, :, :, :])[0][0]
            x = logits
            e_x = np.exp(x - np.max(x))
            probs = e_x / e_x.sum(axis=0)
            act = np.random.choice(range(probs.shape[-1]), 1, p=probs)[0]
            return act
        else:
            raise ValueError("Unknown eval type {}".format(eval_type))

    self.model = step_model
    self.model2 = train_model
    self.expert_train_model = expert_train_model
    self.vf_fisher = vf_fisher_loss
    self.pg_fisher = pg_fisher_loss
    self.joint_fisher = joint_fisher_loss
    self.params = params
    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.eval_step = eval_step
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
    tf.local_variables_initializer().run(session=sess)
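# --- Hedged usage sketch (not part of the original file) ---------------------
# Greedy vs. sampled action selection through the eval_step() closure exposed
# above; `model` and `single_obs` are placeholders for an instantiated Model
# and one observation shaped like ob_space.
def _example_eval(model, single_obs):
    greedy_act = model.eval_step(single_obs, eval_type='argmax')   # argmax with 1% random exploration
    sampled_act = model.eval_step(single_obs, eval_type='prob')    # sample from softmax probabilities
    return greedy_act, sampled_act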
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32,
             nsteps=20, nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0,
             lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=nprocs,
                            inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])
    SUB3 = tf.placeholder(tf.int32, [nbatch])
    SUB4 = tf.placeholder(tf.int32, [nbatch])
    SUB5 = tf.placeholder(tf.int32, [nbatch])
    SUB6 = tf.placeholder(tf.int32, [nbatch])
    SUB7 = tf.placeholder(tf.int32, [nbatch])
    SUB8 = tf.placeholder(tf.int32, [nbatch])
    SUB9 = tf.placeholder(tf.int32, [nbatch])
    SUB10 = tf.placeholder(tf.int32, [nbatch])
    SUB11 = tf.placeholder(tf.int32, [nbatch])
    SUB12 = tf.placeholder(tf.int32, [nbatch])
    X0 = tf.placeholder(tf.int32, [nbatch])
    Y0 = tf.placeholder(tf.int32, [nbatch])
    X1 = tf.placeholder(tf.int32, [nbatch])
    Y1 = tf.placeholder(tf.int32, [nbatch])
    X2 = tf.placeholder(tf.int32, [nbatch])
    Y2 = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A
    logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
    logpac_sub3 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub3, labels=SUB3)
    logpac_sub4 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub4, labels=SUB4)
    logpac_sub5 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub5, labels=SUB5)
    logpac_sub6 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub6, labels=SUB6)
    logpac_sub7 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub7, labels=SUB7)
    logpac_sub8 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub8, labels=SUB8)
    logpac_sub9 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub9, labels=SUB9)
    logpac_sub10 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub10, labels=SUB10)
    logpac_sub11 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub11, labels=SUB11)
    logpac_sub12 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub12, labels=SUB12)
    logpac_x0 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x0, labels=X0)
    logpac_y0 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y0, labels=Y0)
    logpac_x1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x1, labels=X1)
    logpac_y1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y1, labels=Y1)
    logpac_x2 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x2, labels=X2)
    logpac_y2 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y2, labels=Y2)

    self.logits = logits = train_model.pi

    ## training loss
    pg_loss = tf.reduce_mean(ADV * logpac) * tf.reduce_mean(ADV)
    pg_loss_sub3 = tf.reduce_mean(ADV * logpac_sub3) * tf.reduce_mean(ADV)
    pg_loss_sub4 = tf.reduce_mean(ADV * logpac_sub4) * tf.reduce_mean(ADV)
    pg_loss_sub5 = tf.reduce_mean(ADV * logpac_sub5) * tf.reduce_mean(ADV)
    pg_loss_sub6 = tf.reduce_mean(ADV * logpac_sub6) * tf.reduce_mean(ADV)
    pg_loss_sub7 = tf.reduce_mean(ADV * logpac_sub7) * tf.reduce_mean(ADV)
    pg_loss_sub8 = tf.reduce_mean(ADV * logpac_sub8) * tf.reduce_mean(ADV)
    pg_loss_sub9 = tf.reduce_mean(ADV * logpac_sub9) * tf.reduce_mean(ADV)
    pg_loss_sub10 = tf.reduce_mean(ADV * logpac_sub10) * tf.reduce_mean(ADV)
    pg_loss_sub11 = tf.reduce_mean(ADV * logpac_sub11) * tf.reduce_mean(ADV)
    pg_loss_sub12 = tf.reduce_mean(ADV * logpac_sub12) * tf.reduce_mean(ADV)
    pg_loss_x0 = tf.reduce_mean(ADV * logpac_x0) * tf.reduce_mean(ADV)
    pg_loss_y0 = tf.reduce_mean(ADV * logpac_y0) * tf.reduce_mean(ADV)
    pg_loss_x1 = tf.reduce_mean(ADV * logpac_x1) * tf.reduce_mean(ADV)
    pg_loss_y1 = tf.reduce_mean(ADV * logpac_y1) * tf.reduce_mean(ADV)
    pg_loss_x2 = tf.reduce_mean(ADV * logpac_x2) * tf.reduce_mean(ADV)
    pg_loss_y2 = tf.reduce_mean(ADV * logpac_y2) * tf.reduce_mean(ADV)

    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    entropy_sub3 = tf.reduce_mean(cat_entropy(train_model.pi_sub3))
    entropy_sub4 = tf.reduce_mean(cat_entropy(train_model.pi_sub4))
    entropy_sub5 = tf.reduce_mean(cat_entropy(train_model.pi_sub5))
    entropy_sub6 = tf.reduce_mean(cat_entropy(train_model.pi_sub6))
    entropy_sub7 = tf.reduce_mean(cat_entropy(train_model.pi_sub7))
    entropy_sub8 = tf.reduce_mean(cat_entropy(train_model.pi_sub8))
    entropy_sub9 = tf.reduce_mean(cat_entropy(train_model.pi_sub9))
    entropy_sub10 = tf.reduce_mean(cat_entropy(train_model.pi_sub10))
    entropy_sub11 = tf.reduce_mean(cat_entropy(train_model.pi_sub11))
    entropy_sub12 = tf.reduce_mean(cat_entropy(train_model.pi_sub12))
    entropy_x0 = tf.reduce_mean(cat_entropy(train_model.pi_x0))
    entropy_y0 = tf.reduce_mean(cat_entropy(train_model.pi_y0))
    entropy_x1 = tf.reduce_mean(cat_entropy(train_model.pi_x1))
    entropy_y1 = tf.reduce_mean(cat_entropy(train_model.pi_y1))
    entropy_x2 = tf.reduce_mean(cat_entropy(train_model.pi_x2))
    entropy_y2 = tf.reduce_mean(cat_entropy(train_model.pi_y2))

    pg_loss = pg_loss - ent_coef * entropy
    pg_loss_sub3 = pg_loss_sub3 - ent_coef * entropy_sub3
    pg_loss_sub4 = pg_loss_sub4 - ent_coef * entropy_sub4
    pg_loss_sub5 = pg_loss_sub5 - ent_coef * entropy_sub5
    pg_loss_sub6 = pg_loss_sub6 - ent_coef * entropy_sub6
    pg_loss_sub7 = pg_loss_sub7 - ent_coef * entropy_sub7
    pg_loss_sub8 = pg_loss_sub8 - ent_coef * entropy_sub8
    pg_loss_sub9 = pg_loss_sub9 - ent_coef * entropy_sub9
    pg_loss_sub10 = pg_loss_sub10 - ent_coef * entropy_sub10
    pg_loss_sub11 = pg_loss_sub11 - ent_coef * entropy_sub11
    pg_loss_sub12 = pg_loss_sub12 - ent_coef * entropy_sub12
    pg_loss_x0 = pg_loss_x0 - ent_coef * entropy_x0
    pg_loss_y0 = pg_loss_y0 - ent_coef * entropy_y0
    pg_loss_x1 = pg_loss_x1 - ent_coef * entropy_x1
    pg_loss_y1 = pg_loss_y1 - ent_coef * entropy_y1
    pg_loss_x2 = pg_loss_x2 - ent_coef * entropy_x2
    pg_loss_y2 = pg_loss_y2 - ent_coef * entropy_y2

    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))

    self.params = params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model')
    self.params_common = params_common = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
    self.params_pi1 = params_pi1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/pi1') + params_common  # Base Action

    train_loss = pg_loss + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
    sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
    self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
        tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
    self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
    print("train_loss :", train_loss, " pg_fisher :", pg_fisher_loss,
          " vf_fisher :", vf_fisher_loss, " joint_fisher_loss :", joint_fisher_loss)

    self.grads_check = grads = tf.gradients(train_loss, params_pi1)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params_pi1)
        train_op, q_runner = optim.apply_gradients(list(zip(grads, params_pi1)))
    self.q_runner = q_runner

    # sub3
    self.params_sub3 = params_sub3 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub3')
    train_loss_sub3 = pg_loss_sub3 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_sub3 = pg_fisher_loss_sub3 = -tf.reduce_mean(logpac_sub3)
    self.joint_fisher_sub3 = joint_fisher_loss_sub3 = pg_fisher_loss_sub3 + vf_fisher_loss
    self.grads_check_sub3 = grads_sub3 = tf.gradients(train_loss_sub3, params_sub3)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub3, var_list=params_sub3)
        train_op_sub3, q_runner_sub3 = optim.apply_gradients(list(zip(grads_sub3, params_sub3)))
    self.q_runner_sub3 = q_runner_sub3

    # sub4
    self.params_sub4 = params_sub4 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub4')
    train_loss_sub4 = pg_loss_sub4 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_sub4 = pg_fisher_loss_sub4 = -tf.reduce_mean(logpac_sub4)
    self.joint_fisher_sub4 = joint_fisher_loss_sub4 = pg_fisher_loss_sub4 + vf_fisher_loss
    self.grads_check_sub4 = grads_sub4 = tf.gradients(train_loss_sub4, params_sub4)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub4, var_list=params_sub4)
        train_op_sub4, q_runner_sub4 = optim.apply_gradients(list(zip(grads_sub4, params_sub4)))
    self.q_runner_sub4 = q_runner_sub4

    # sub5
    self.params_sub5 = params_sub5 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub5')
    train_loss_sub5 = pg_loss_sub5 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_sub5 = pg_fisher_loss_sub5 = -tf.reduce_mean(logpac_sub5)
    self.joint_fisher_sub5 = joint_fisher_loss_sub5 = pg_fisher_loss_sub5 + vf_fisher_loss
    self.grads_check_sub5 = grads_sub5 = tf.gradients(train_loss_sub5, params_sub5)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub5, var_list=params_sub5)
        train_op_sub5, q_runner_sub5 = optim.apply_gradients(list(zip(grads_sub5, params_sub5)))
    self.q_runner_sub5 = q_runner_sub5

    # sub6
    self.params_sub6 = params_sub6 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub6')
    train_loss_sub6 = pg_loss_sub6 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_sub6 = pg_fisher_loss_sub6 = -tf.reduce_mean(logpac_sub6)
    self.joint_fisher_sub6 = joint_fisher_loss_sub6 = pg_fisher_loss_sub6 + vf_fisher_loss
    self.grads_check_sub6 = grads_sub6 = tf.gradients(train_loss_sub6, params_sub6)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub6, var_list=params_sub6)
        train_op_sub6, q_runner_sub6 = optim.apply_gradients(list(zip(grads_sub6, params_sub6)))
    self.q_runner_sub6 = q_runner_sub6

    # sub7
    self.params_sub7 = params_sub7 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub7')
    train_loss_sub7 = pg_loss_sub7 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_sub7 = pg_fisher_loss_sub7 = -tf.reduce_mean(logpac_sub7)
    self.joint_fisher_sub7 = joint_fisher_loss_sub7 = pg_fisher_loss_sub7 + vf_fisher_loss
    self.grads_check_sub7 = grads_sub7 = tf.gradients(train_loss_sub7, params_sub7)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub7, var_list=params_sub7)
        train_op_sub7, q_runner_sub7 = optim.apply_gradients(list(zip(grads_sub7, params_sub7)))
    self.q_runner_sub7 = q_runner_sub7

    # sub8
    self.params_sub8 = params_sub8 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub8')
    train_loss_sub8 = pg_loss_sub8 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_sub8 = pg_fisher_loss_sub8 = -tf.reduce_mean(logpac_sub8)
    self.joint_fisher_sub8 = joint_fisher_loss_sub8 = pg_fisher_loss_sub8 + vf_fisher_loss
    self.grads_check_sub8 = grads_sub8 = tf.gradients(train_loss_sub8, params_sub8)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub8, var_list=params_sub8)
        train_op_sub8, q_runner_sub8 = optim.apply_gradients(list(zip(grads_sub8, params_sub8)))
    self.q_runner_sub8 = q_runner_sub8

    # sub9
    self.params_sub9 = params_sub9 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub9')
    train_loss_sub9 = pg_loss_sub9 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_sub9 = pg_fisher_loss_sub9 = -tf.reduce_mean(logpac_sub9)
    self.joint_fisher_sub9 = joint_fisher_loss_sub9 = pg_fisher_loss_sub9 + vf_fisher_loss
    self.grads_check_sub9 = grads_sub9 = tf.gradients(train_loss_sub9, params_sub9)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub9, var_list=params_sub9)
        train_op_sub9, q_runner_sub9 = optim.apply_gradients(list(zip(grads_sub9, params_sub9)))
    self.q_runner_sub9 = q_runner_sub9

    # sub10
    self.params_sub10 = params_sub10 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub10')
    train_loss_sub10 = pg_loss_sub10 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_sub10 = pg_fisher_loss_sub10 = -tf.reduce_mean(logpac_sub10)
    self.joint_fisher_sub10 = joint_fisher_loss_sub10 = pg_fisher_loss_sub10 + vf_fisher_loss
    self.grads_check_sub10 = grads_sub10 = tf.gradients(train_loss_sub10, params_sub10)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub10, var_list=params_sub10)
        train_op_sub10, q_runner_sub10 = optim.apply_gradients(list(zip(grads_sub10, params_sub10)))
    self.q_runner_sub10 = q_runner_sub10

    # sub11
    self.params_sub11 = params_sub11 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub11')
    train_loss_sub11 = pg_loss_sub11 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_sub11 = pg_fisher_loss_sub11 = -tf.reduce_mean(logpac_sub11)
    self.joint_fisher_sub11 = joint_fisher_loss_sub11 = pg_fisher_loss_sub11 + vf_fisher_loss
    self.grads_check_sub11 = grads_sub11 = tf.gradients(train_loss_sub11, params_sub11)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub11, var_list=params_sub11)
        train_op_sub11, q_runner_sub11 = optim.apply_gradients(list(zip(grads_sub11, params_sub11)))
    self.q_runner_sub11 = q_runner_sub11

    # sub12
    self.params_sub12 = params_sub12 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub12')
    train_loss_sub12 = pg_loss_sub12 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_sub12 = pg_fisher_loss_sub12 = -tf.reduce_mean(logpac_sub12)
    self.joint_fisher_sub12 = joint_fisher_loss_sub12 = pg_fisher_loss_sub12 + vf_fisher_loss
    self.grads_check_sub12 = grads_sub12 = tf.gradients(train_loss_sub12, params_sub12)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub12, var_list=params_sub12)
        train_op_sub12, q_runner_sub12 = optim.apply_gradients(list(zip(grads_sub12, params_sub12)))
    self.q_runner_sub12 = q_runner_sub12

    # x0
    self.params_xy0 = params_xy0 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common
    train_loss_x0 = pg_loss_x0 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_x0 = pg_fisher_loss_x0 = -tf.reduce_mean(logpac_x0)
    self.joint_fisher_x0 = joint_fisher_loss_x0 = pg_fisher_loss_x0 + vf_fisher_loss
    self.grads_check_x0 = grads_x0 = tf.gradients(train_loss_x0, params_xy0)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_x0, var_list=params_xy0)
        train_op_x0, q_runner_x0 = optim.apply_gradients(list(zip(grads_x0, params_xy0)))
    self.q_runner_x0 = q_runner_x0

    # y0
    train_loss_y0 = pg_loss_y0 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_y0 = pg_fisher_loss_y0 = -tf.reduce_mean(logpac_y0)
    self.joint_fisher_y0 = joint_fisher_loss_y0 = pg_fisher_loss_y0 + vf_fisher_loss
    self.grads_check_y0 = grads_y0 = tf.gradients(train_loss_y0, params_xy0)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_y0, var_list=params_xy0)
        train_op_y0, q_runner_y0 = optim.apply_gradients(list(zip(grads_y0, params_xy0)))
    self.q_runner_y0 = q_runner_y0

    # x1
    self.params_xy1 = params_xy1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common
    train_loss_x1 = pg_loss_x1 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_x1 = pg_fisher_loss_x1 = -tf.reduce_mean(logpac_x1)
    self.joint_fisher_x1 = joint_fisher_loss_x1 = pg_fisher_loss_x1 + vf_fisher_loss
    self.grads_check_x1 = grads_x1 = tf.gradients(train_loss_x1, params_xy1)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_x1, var_list=params_xy1)
        train_op_x1, q_runner_x1 = optim.apply_gradients(list(zip(grads_x1, params_xy1)))
    self.q_runner_x1 = q_runner_x1

    # y1
    train_loss_y1 = pg_loss_y1 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_y1 = pg_fisher_loss_y1 = -tf.reduce_mean(logpac_y1)
    self.joint_fisher_y1 = joint_fisher_loss_y1 = pg_fisher_loss_y1 + vf_fisher_loss
    self.grads_check_y1 = grads_y1 = tf.gradients(train_loss_y1, params_xy1)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_y1, var_list=params_xy1)
        train_op_y1, q_runner_y1 = optim.apply_gradients(list(zip(grads_y1, params_xy1)))
    self.q_runner_y1 = q_runner_y1

    # x2
    self.params_xy2 = params_xy2 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy2') + params_common
    train_loss_x2 = pg_loss_x2 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_x2 = pg_fisher_loss_x2 = -tf.reduce_mean(logpac_x2)
    self.joint_fisher_x2 = joint_fisher_loss_x2 = pg_fisher_loss_x2 + vf_fisher_loss
    self.grads_check_x2 = grads_x2 = tf.gradients(train_loss_x2, params_xy2)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_x2, var_list=params_xy2)
        train_op_x2, q_runner_x2 = optim.apply_gradients(list(zip(grads_x2, params_xy2)))
    self.q_runner_x2 = q_runner_x2

    # y2
    train_loss_y2 = pg_loss_y2 + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher_y2 = pg_fisher_loss_y2 = -tf.reduce_mean(logpac_y2)
    self.joint_fisher_y2 = joint_fisher_loss_y2 = pg_fisher_loss_y2 + vf_fisher_loss
    self.grads_check_y2 = grads_y2 = tf.gradients(train_loss_y2, params_xy2)
    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_y2, var_list=params_xy2)
        train_op_y2, q_runner_y2 = optim.apply_gradients(list(zip(grads_y2, params_xy2)))
    self.q_runner_y2 = q_runner_y2

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, sub3, sub4, sub5, sub6, sub7,
              sub8, sub9, sub10, sub11, sub12, x0, y0, x1, y1, x2, y2, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = self.lr.value()

        td_map = {train_model.X: obs, A: actions,
                  SUB3: sub3, SUB4: sub4, SUB5: sub5, SUB6: sub6, SUB7: sub7,
                  SUB8: sub8, SUB9: sub9, SUB10: sub10, SUB11: sub11, SUB12: sub12,
                  X0: x0, Y0: y0, X1: x1, Y1: y1, X2: x2, Y2: y2,
                  ADV: advs, R: rewards, PG_LR: cur_lr}
        if states != []:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks

        policy_loss, value_loss, policy_entropy, _, \
        policy_loss_sub3, policy_entropy_sub3, _, \
        policy_loss_sub4, policy_entropy_sub4, _, \
        policy_loss_sub5, policy_entropy_sub5, _, \
        policy_loss_sub6, policy_entropy_sub6, _, \
        policy_loss_sub7, policy_entropy_sub7, _, \
        policy_loss_sub8, policy_entropy_sub8, _, \
        policy_loss_sub9, policy_entropy_sub9, _, \
        policy_loss_sub10, policy_entropy_sub10, _, \
        policy_loss_sub11, policy_entropy_sub11, _, \
        policy_loss_sub12, policy_entropy_sub12, _, \
        policy_loss_x0, policy_entropy_x0, _, \
        policy_loss_y0, policy_entropy_y0, _, \
        policy_loss_x1, policy_entropy_x1, _, \
        policy_loss_y1, policy_entropy_y1, _, \
        policy_loss_x2, policy_entropy_x2, _, \
        policy_loss_y2, policy_entropy_y2, _ = sess.run(
            [pg_loss, vf_loss, entropy, train_op,
             pg_loss_sub3, entropy_sub3, train_op_sub3,
             pg_loss_sub4, entropy_sub4, train_op_sub4,
             pg_loss_sub5, entropy_sub5, train_op_sub5,
             pg_loss_sub6, entropy_sub6, train_op_sub6,
             pg_loss_sub7, entropy_sub7, train_op_sub7,
             pg_loss_sub8, entropy_sub8, train_op_sub8,
             pg_loss_sub9, entropy_sub9, train_op_sub9,
             pg_loss_sub10, entropy_sub10, train_op_sub10,
             pg_loss_sub11, entropy_sub11, train_op_sub11,
             pg_loss_sub12, entropy_sub12, train_op_sub12,
             pg_loss_x0, entropy_x0, train_op_x0,
             pg_loss_y0, entropy_y0, train_op_y0,
             pg_loss_x1, entropy_x1, train_op_x1,
             pg_loss_y1, entropy_y1, train_op_y1,
             pg_loss_x2, entropy_x2, train_op_x2,
             pg_loss_y2, entropy_y2, train_op_y2],
            td_map)
        print("policy_loss : ", policy_loss, " value_loss : ", value_loss,
              " entropy : ", policy_entropy)

        # policy_loss = 1 if(np.isinf(policy_loss)) else policy_loss
        # value_loss = 1 if(np.isinf(value_loss)) else value_loss
        # policy_entropy = 1 if(np.isinf(policy_entropy)) else policy_entropy
        #
        # policy_loss_sub3 = 1 if(np.isinf(policy_loss_sub3)) else policy_loss_sub3
        # value_loss = 1 if(np.isinf(value_loss)) else value_loss
        # policy_entropy = 1 if(np.isinf(policy_entropy)) else policy_entropy

        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        ps = sess.run(params)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")
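# --- Hedged usage sketch (not part of the original file) ---------------------
# The constructor above stores one K-FAC queue runner per policy head
# (q_runner, q_runner_sub3, ..., q_runner_y2) but never starts them. A training
# loop would typically start every non-None runner on a single coordinator,
# roughly as below; the attribute-name scan follows the naming used above and
# relies only on standard TF1 QueueRunner/Coordinator APIs.
def _example_start_queue_runners(model, sess):
    coord = tf.train.Coordinator()
    threads = []
    for name in dir(model):
        if name.startswith('q_runner'):
            q_runner = getattr(model, name)
            if q_runner is not None:
                threads += q_runner.create_threads(sess, coord=coord, start=True)
    return coord, threads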
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32,
             nsteps=20, nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0,
             lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=nprocs,
                            inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])
    SUB3 = tf.placeholder(tf.int32, [nbatch])
    SUB4 = tf.placeholder(tf.int32, [nbatch])
    SUB5 = tf.placeholder(tf.int32, [nbatch])
    SUB6 = tf.placeholder(tf.int32, [nbatch])
    SUB7 = tf.placeholder(tf.int32, [nbatch])
    SUB8 = tf.placeholder(tf.int32, [nbatch])
    SUB9 = tf.placeholder(tf.int32, [nbatch])
    SUB10 = tf.placeholder(tf.int32, [nbatch])
    SUB11 = tf.placeholder(tf.int32, [nbatch])
    SUB12 = tf.placeholder(tf.int32, [nbatch])
    X0 = tf.placeholder(tf.int32, [nbatch])
    Y0 = tf.placeholder(tf.int32, [nbatch])
    X1 = tf.placeholder(tf.int32, [nbatch])
    Y1 = tf.placeholder(tf.int32, [nbatch])
    X2 = tf.placeholder(tf.int32, [nbatch])
    Y2 = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # One joint negative log-likelihood summed over the base action head and
    # every sub-action / coordinate head.
    logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub3, labels=SUB3) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub4, labels=SUB4) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub5, labels=SUB5) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub6, labels=SUB6) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub7, labels=SUB7) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub8, labels=SUB8) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub9, labels=SUB9) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub10, labels=SUB10) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub11, labels=SUB11) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub12, labels=SUB12) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x0, labels=X0) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y0, labels=Y0) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x1, labels=X1) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y1, labels=Y1) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x2, labels=X2) \
        + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y2, labels=Y2)

    self.logits = logits = train_model.pi

    ## training loss
    pg_loss = tf.reduce_mean(ADV * logpac) * tf.reduce_mean(ADV)
    entropy = tf.reduce_mean(cat_entropy(train_model.pi)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_sub3)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_sub4)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_sub5)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_sub6)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_sub7)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_sub8)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_sub9)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_sub10)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_sub11)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_sub12)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_x0)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_y0)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_x1)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_y1)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_x2)) \
        + tf.reduce_mean(cat_entropy(train_model.pi_y2))
    pg_loss = pg_loss - ent_coef * entropy
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    train_loss = pg_loss + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
    sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
    self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
        tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
    self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

    self.params = params = find_trainable_variables("model")
    self.grads_check = grads = tf.gradients(train_loss, params)

    with tf.device('/gpu:0'):
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
        train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))
    self.q_runner = q_runner
    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, sub3, sub4, sub5, sub6, sub7,
              sub8, sub9, sub10, sub11, sub12, x0, y0, x1, y1, x2, y2, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = self.lr.value()

        td_map = {train_model.X: obs, A: actions,
                  SUB3: sub3, SUB4: sub4, SUB5: sub5, SUB6: sub6, SUB7: sub7,
                  SUB8: sub8, SUB9: sub9, SUB10: sub10, SUB11: sub11, SUB12: sub12,
                  X0: x0, Y0: y0, X1: x1, Y1: y1, X2: x2, Y2: y2,
                  ADV: advs, R: rewards, PG_LR: cur_lr}
        if states != []:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks

        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, train_op], td_map)
        print("policy_loss : ", policy_loss, " value_loss : ", value_loss,
              " entropy : ", policy_entropy)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        ps = sess.run(params)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")
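# --- Hedged usage sketch (not part of the original file) ---------------------
# save()/load() above serialise the list of parameter arrays with joblib, so a
# checkpoint roundtrip looks like this. `model` is a placeholder for an
# instantiated Model and the path is purely illustrative.
def _example_checkpoint_roundtrip(model, path='/tmp/acktr_params.joblib'):
    model.save(path)   # sess.run(params) -> joblib.dump
    model.load(path)   # joblib.load -> one tf assign per variable
    return path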