class Model(object):

    def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32,
                 nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25,
                 max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs * nsteps,
                                           nsteps, reuse=True)

        # Negative log-probability of the taken actions under the current policy.
        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        # Training loss: advantage-weighted policy gradient with an entropy bonus,
        # plus the value-function regression loss.
        pg_loss = tf.reduce_mean(ADV * logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss

        # Fisher loss construction: the policy term is the sampled log-likelihood;
        # the value term regresses onto the current prediction plus unit Gaussian
        # noise, giving KFAC the statistics of a Gaussian value head.
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")
        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            # Note: `async` became a reserved word in Python 3.7; later versions
            # of this optimizer rename the argument to `is_async`.
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
                epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10,
                max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss,
                                                            var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            # Advance the learning-rate schedule once per step in the batch.
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards,
                      PG_LR: cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
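
# The training loss above combines three scalars: an advantage-weighted negative
# log-likelihood, a categorical entropy bonus, and an MSE value loss. A minimal
# self-contained NumPy sketch of the same arithmetic for one batch (all names
# and numbers below are illustrative, not taken from the code above):
import numpy as np

def softmax(x):
    z = x - x.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

logits = np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]])  # [nbatch=2, nact=3]
actions = np.array([0, 2])
advs = np.array([1.5, -0.5])

probs = softmax(logits)
logpac = -np.log(probs[np.arange(2), actions])           # sparse softmax cross-entropy
entropy = -(probs * np.log(probs)).sum(axis=1).mean()    # cat_entropy, batch-averaged
pg_loss = (advs * logpac).mean() - 0.01 * entropy        # ent_coef = 0.01
print(pg_loss)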
class Model(object):
    def __init__(self, policy, ob_space, ac_space, n_envs, total_timesteps, nprocs=32,
                 n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0,
                 learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001,
                 lr_schedule='linear'):
        """
        The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class,
        https://arxiv.org/abs/1708.05144

        :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
        :param ob_space: (Gym Space) The observation space
        :param ac_space: (Gym Space) The action space
        :param n_envs: (int) The number of environments
        :param total_timesteps: (int) The total number of timesteps for training the model
        :param nprocs: (int) The number of threads for TensorFlow operations
        :param n_steps: (int) The number of steps to run for each environment
        :param ent_coef: (float) The weight for the entropy loss
        :param vf_coef: (float) The weight for the loss on the value function
        :param vf_fisher_coef: (float) The weight for the Fisher loss on the value function
        :param learning_rate: (float) The initial learning rate for the optimizer
        :param max_grad_norm: (float) The clipping value for the maximum gradient
        :param kfac_clip: (float) gradient clipping for Kullback-Leibler
        :param lr_schedule: (str) The type of scheduler for the learning rate update
            ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
        """
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        n_batch = n_envs * n_steps

        action_ph = tf.placeholder(tf.int32, [n_batch])
        advs_ph = tf.placeholder(tf.float32, [n_batch])
        rewards_ph = tf.placeholder(tf.float32, [n_batch])
        pg_lr_ph = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess, ob_space, ac_space, n_envs, 1,
                                         reuse=False)
        self.model2 = train_model = policy(sess, ob_space, ac_space, n_envs * n_steps,
                                           n_steps, reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.policy, labels=action_ph)
        self.logits = train_model.policy

        # training loss
        pg_loss = tf.reduce_mean(advs_ph * logpac)
        entropy = tf.reduce_mean(calc_entropy(train_model.policy))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
        train_loss = pg_loss + vf_coef * vf_loss

        # Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.value_fn + tf.random_normal(
            tf.shape(train_model.value_fn))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")
        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=pg_lr_ph, clip_kl=kfac_clip, momentum=0.9,
                kfac_update=1, epsilon=0.01, stats_decay=0.99, async=1,
                cold_iter=10, max_grad_norm=max_grad_norm)

            optim.compute_and_apply_stats(self.joint_fisher, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))
        self.q_runner = q_runner
        self.learning_rate = Scheduler(initial_value=learning_rate,
                                       n_values=total_timesteps,
                                       schedule=lr_schedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            # Advance the learning-rate schedule once per step in the batch.
            for _ in range(len(obs)):
                cur_lr = self.learning_rate.value()

            td_map = {train_model.obs_ph: obs, action_ph: actions, advs_ph: advs,
                      rewards_ph: rewards, pg_lr_ph: cur_lr}
            if states is not None:
                td_map[train_model.states_ph] = states
                td_map[train_model.masks_ph] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            session_params = sess.run(params)
            joblib.dump(session_params, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for param, loaded_p in zip(params, loaded_params):
                restores.append(param.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
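
# Why regress onto `value_fn + noise` with a stop_gradient? One way to read the
# trick (an interpretation, not stated in the source): the per-sample gradient
# of -(v - stop_grad(v + eps))**2 w.r.t. v is 2*eps, proportional to the score
# of a unit-variance Gaussian likelihood centered on the prediction, so the
# statistics KFAC collects from it estimate the value head's Fisher. A NumPy
# Monte-Carlo check that the squared gradients average to the unit Fisher:
import numpy as np

rng = np.random.default_rng(0)
v = np.zeros(100_000)                # value predictions (constant, w.l.o.g.)
eps = rng.standard_normal(v.shape)   # sample_net = v + eps
grad = -2.0 * (v - (v + eps))        # d/dv of -(v - stop_grad(v + eps))**2
print(np.mean(grad ** 2) / 4.0)      # ~1.0: E[eps^2], the unit-variance Gaussian Fisher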
class Model(object):

    def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32,
                 nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25,
                 max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear',
                 is_async=True):
        self.sess = sess = get_session()
        nbatch = nenvs * nsteps
        A = tf.placeholder(ac_space.dtype, [nbatch] + list(ac_space.shape))
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
            self.model = step_model = policy(nenvs, 1, sess=sess)
            self.model2 = train_model = policy(nenvs * nsteps, nsteps, sess=sess)

        # The probability-distribution helper generalizes the explicit softmax
        # cross-entropy of the discrete-only variants above to any action space.
        neglogpac = train_model.pd.neglogp(A)
        self.logits = train_model.pi

        # training loss
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        entropy = tf.reduce_mean(train_model.pd.entropy())
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
        train_loss = pg_loss + vf_coef * vf_loss

        # Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("acktr_model")
        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
                epsilon=0.01, stats_decay=0.99, is_async=is_async, cold_iter=10,
                max_grad_norm=max_grad_norm)

            optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            # Advance the learning-rate schedule once per step in the batch.
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards,
                      PG_LR: cur_lr, VF_LR: cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
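
# In this variant, train_model.pd.neglogp(A) replaces the explicit
# tf.nn.sparse_softmax_cross_entropy_with_logits of the earlier discrete-only
# code. For a categorical distribution the two coincide; a quick self-contained
# NumPy check (illustrative values only):
import numpy as np

logits = np.array([[1.0, -2.0, 0.5]])
action = 2
# neglogp(a) = -log softmax(logits)[a] ...
z = logits - logits.max()
log_probs = z - np.log(np.exp(z).sum())
neglogp = -log_probs[0, action]
# ... which is exactly sparse softmax cross-entropy with label a:
xent = np.log(np.exp(z).sum()) - z[0, action]
assert np.isclose(neglogp, xent)
print(neglogp)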
class Model(object):

    def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32,
                 nscripts=16, nsteps=20, nstack=4, ent_coef=0.1, vf_coef=0.5,
                 vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.001, kfac_clip=0.001,
                 lrschedule='linear', alpha=0.99, epsilon=1e-5):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nsml.bind(sess=sess)  # NSML experiment tracking
        # nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        XY0 = tf.placeholder(tf.int32, [nbatch])
        XY1 = tf.placeholder(tf.int32, [nbatch])
        # ADV == TD_TARGET - values
        ADV = tf.placeholder(tf.float32, [nbatch])
        TD_TARGET = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack,
                                         reuse=False)
        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps,
                                           nstack, reuse=True)

        # Policy 1: base action head (train_model.pi), labels A.
        # The first nscripts environments are driven by a scripted policy, the
        # rest by the learned policy; script_mask is 0 for scripted rows and 1
        # otherwise (this assumes nenvs == nprocs so the mask covers the batch).
        script_mask = tf.concat([
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ], axis=0)

        pi = train_model.pi
        # Weight each sample's cross-entropy: scripted rows get weight 1.0,
        # learned rows get the (stop-gradient) policy probability of the chosen
        # action.
        pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=A)
        neglogpac *= tf.stop_gradient(pac_weight)

        inv_A = 1.0 - tf.cast(A, tf.float32)  # (unused)

        xy0_mask = tf.cast(A, tf.float32)
        xy1_mask = tf.cast(A, tf.float32)

        condition0 = tf.equal(xy0_mask, 2)
        xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
        xy0_mask = 1.0 - xy0_mask

        condition1 = tf.equal(xy1_mask, 2)
        xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

        # One-hot representation of the chosen marine: [batch_size, 2]
        pi_xy0 = train_model.pi_xy0
        pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

        logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=pi_xy0, labels=XY0)
        logpac_xy0 *= tf.stop_gradient(pac_weight)
        logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

        pi_xy1 = train_model.pi_xy1
        pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
        # The original weighted this head with tf.one_hot(XY0, ...), which looks
        # like a copy-paste slip; the xy1 head should be weighted by its own labels.
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY1, depth=1024), axis=1)

        logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=pi_xy1, labels=XY1)
        logpac_xy1 *= tf.stop_gradient(pac_weight)
        logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

        pg_loss = tf.reduce_mean(ADV * neglogpac)
        pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
        pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

        vf_ = tf.squeeze(train_model.vf)
        vf_r = tf.concat([
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ], axis=0) * TD_TARGET
        # For scripted rows, replace the predicted value with the TD target.
        vf_masked = vf_ * script_mask + vf_r
        vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))

        entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
        entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
        entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
        entropy = entropy_a + entropy_xy0 + entropy_xy1

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        self.logits = logits = train_model.pi

        # xy0: first coordinate head, trained together with the shared trunk.
        self.params_common = params_common = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
        self.params_xy0 = params_xy0 = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common

        train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

        self.grads_check_xy0 = grads_xy0 = tf.gradients(train_loss_xy0, params_xy0)
        if max_grad_norm is not None:
            grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

        grads_xy0 = list(zip(grads_xy0, params_xy0))
        trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha,
                                                epsilon=epsilon)
        _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

        # xy1: second coordinate head.
        self.params_xy1 = params_xy1 = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common

        train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

        self.grads_check_xy1 = grads_xy1 = tf.gradients(train_loss_xy1, params_xy1)
        if max_grad_norm is not None:
            grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

        grads_xy1 = list(zip(grads_xy1, params_xy1))
        trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha,
                                                epsilon=epsilon)
        _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
            advs = td_targets - values
            # Advance the learning-rate schedule once per step in the batch.
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X: obs, A: actions, XY0: xy0, XY1: xy1,
                      ADV: advs, TD_TARGET: td_targets, PG_LR: cur_lr}
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _, \
                policy_loss_xy0, policy_entropy_xy0, _, \
                policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
                    [pg_loss, vf_loss, entropy, _train,
                     pg_loss_xy0, entropy_xy0, _train_xy0,
                     pg_loss_xy1, entropy_xy1, _train_xy1],
                    td_map)
            return (policy_loss, value_loss, policy_entropy,
                    policy_loss_xy0, policy_entropy_xy0,
                    policy_loss_xy1, policy_entropy_xy1)

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        print("global_variables_initializer start")
        tf.global_variables_initializer().run(session=sess)
        print("global_variables_initializer complete")
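
# The masking above mixes scripted demonstrations with on-policy rollouts in a
# single batch. A small self-contained NumPy sketch of the weighting rule
# (illustrative sizes: 2 scripted rows, 2 learned rows, 3 actions):
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

nscripts_rows, total_rows = 2, 4
script_mask = np.concatenate([np.zeros(nscripts_rows),
                              np.ones(total_rows - nscripts_rows)])
pi = np.random.default_rng(0).standard_normal((total_rows, 3))
actions = np.array([0, 1, 2, 0])

probs = softmax(pi)
pac_weight = script_mask[:, None] * (probs - 1.0) + 1.0
pac_weight = pac_weight[np.arange(total_rows), actions]
# Scripted rows (mask 0) keep weight 1.0 -- plain cross-entropy, i.e. imitation;
# learned rows (mask 1) are weighted by the policy's own probability of the action.
print(pac_weight)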
def learn_hoof_a2c(network, env, seed=None, nsteps=5, total_timesteps=int(80e6),
                   vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4,
                   lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99,
                   log_interval=100, load_path=None,
                   # Baselines default settings till here
                   optimiser='RMSProp', lr_upper_bound=None, ent_upper_bound=None,
                   num_lr=None, num_ent_coeff=None,
                   max_kl=-1.0,  # -1.0 means no KL constraint
                   **network_kwargs):
    '''
    Main entrypoint for the HOOF A2C algorithm. Train a policy with the given
    network architecture on a given environment using A2C, tuning the learning
    rate and entropy coefficient on the fly.

    Parameters:
    -----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm,
        cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for
        the full list) specifying a standard architecture, or a function that
        takes a tensorflow tensor as input and returns a tuple
        (output_tensor, extra_feed), where output_tensor is the last network
        layer output and extra_feed is None for feed-forward nets or a dictionary
        describing how to feed state into the network for recurrent nets. See
        baselines.common/policies.py/lstm for more details on using recurrent
        nets in policies.

    env: RL environment. Should implement an interface similar to VecEnv
        (baselines.common/vec_env) or be wrapped with DummyVecEnv
        (baselines.common/vec_env/dummy_vec_env.py).

    seed: seed to make the random number sequence in the algorithm reproducible.
        Defaults to None, which means seed from the system noise generator
        (not reproducible).

    nsteps: int, number of steps of the vectorized environment per update
        (i.e. batch size is nsteps * nenv where nenv is the number of environment
        copies simulated in parallel).

    total_timesteps: int, total number of timesteps to train on (default: 80M).

    max_grad_norm: float, gradient is clipped to have global L2 norm no more than
        this value (default: 0.5).

    lr: float, learning rate for RMSProp (the current implementation has RMSProp
        hardcoded in) (default: 7e-4).

    gamma: float, reward discounting parameter (default: 0.99).

    log_interval: int, specifies how frequently the logs are printed out
        (default: 100).

    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and arguments to a particular
        type of network. For instance, the 'mlp' architecture has arguments
        num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    # Get the number of environments
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Overwrite default params if using HOOF
    if lr_upper_bound is not None:
        lr = 1.0
        lrschedule = 'constant'
    else:
        num_lr = 1

    if ent_upper_bound is None:
        num_ent_coeff = 1

    # Instantiate the model object (that creates step_model and train_model)
    model = HOOF_Model(policy=policy, env=env, nsteps=nsteps, optimiser=optimiser,
                       ent_coef=ent_coef, vf_coef=vf_coef,
                       max_grad_norm=max_grad_norm, total_timesteps=total_timesteps,
                       alpha=alpha, epsilon=epsilon)  # defaults for RMSProp

    runner = HOOF_Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    # Calculate the batch size
    nbatch = nenvs * nsteps

    # Model helper functions
    model_params = find_trainable_variables("a2c_model")
    get_flat = U.GetFlat(model_params)
    set_from_flat = U.SetFromFlat(model_params)

    # Approximate KL divergence between two diagonal Gaussian policies.
    def kl(new_mean, new_sd, old_mean, old_sd):
        approx_kl = np.log(new_sd / old_sd) + (
            old_sd ** 2 + (old_mean - new_mean) ** 2) / (2.0 * new_sd ** 2 + 10 ** -8) - 0.5
        approx_kl = np.sum(approx_kl, axis=1)
        approx_kl = np.mean(approx_kl)
        return approx_kl

    if max_kl == -1.0:  # set max KL to a high value in case there is no constraint
        max_kl = 10 ** 8

    # Start total timer
    tstart = time.time()

    for update in range(1, int(total_timesteps // nbatch + 1)):
        obs, states, rewards, masks, actions, values, undisc_rwds, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        old_mean, old_sd, old_neg_ll = model.get_mean_std_neg_ll(obs, actions)

        # Advance the learning-rate schedule once per step in the batch.
        for step in range(len(obs)):
            cur_lr = lr.value()

        opt_pol_val = -10 ** 8
        old_params = get_flat()
        rms_weights_before_upd = model.get_opt_state()
        approx_kl = np.zeros((num_ent_coeff, num_lr))
        epv = np.zeros((num_ent_coeff, num_lr))

        # Sample candidate learning rates and entropy coefficients.
        rand_lr = lr_upper_bound * np.random.rand(num_lr) \
            if lr_upper_bound is not None else [cur_lr]
        rand_lr = np.sort(rand_lr)
        rand_ent_coeff = ent_upper_bound * np.random.rand(num_ent_coeff) \
            if ent_upper_bound is not None else [ent_coef]

        for nec in range(num_ent_coeff):
            # Reset the policy and optimiser
            set_from_flat(old_params)
            model.set_opt_state(rms_weights_before_upd)

            # Get gradients for the loss with the given entropy coefficient
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values, rand_ent_coeff[nec])
            new_params = get_flat()
            ent_grads = new_params - old_params

            # Enumerate over candidate learning rates
            for nlr in range(num_lr):
                new_params = old_params + rand_lr[nlr] * ent_grads
                set_from_flat(new_params)
                new_mean, new_sd, new_neg_ll = model.get_mean_std_neg_ll(obs, actions)
                lik_ratio = np.exp(-new_neg_ll + old_neg_ll)
                est_pol_val = wis_estimate(nenvs, nsteps, undisc_rwds, lik_ratio)
                approx_kl[nec, nlr] = kl(new_mean, new_sd, old_mean, old_sd)
                epv[nec, nlr] = est_pol_val

                if (nec == 0 and nlr == 0) or (est_pol_val > opt_pol_val and
                                               approx_kl[nec, nlr] < max_kl):
                    opt_pol_val = est_pol_val
                    opt_pol_params = get_flat()
                    opt_rms_wts = model.get_opt_state()
                    opt_lr = rand_lr[nlr]
                    opt_ent_coeff = rand_ent_coeff[nec]
                    opt_kl = approx_kl[nec, nlr]

        # Update the policy and RMSProp state to the optimal weights
        set_from_flat(opt_pol_params)
        model.set_opt_state(opt_rms_wts)

        # Shrink the LR search space if too many candidates get rejected
        if lr_upper_bound is not None:
            rejections = np.sum(approx_kl > max_kl) / num_lr
            if rejections > 0.8:
                lr_upper_bound *= 0.8
            if rejections == 0:
                lr_upper_bound *= 1.25

        nseconds = time.time() - tstart

        # Calculate the fps (frames per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the
            # returns (ev near 1) or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("opt_lr", float(opt_lr))
            logger.record_tabular("opt_ent_coeff", float(opt_ent_coeff))
            logger.record_tabular("approx_kl", float(opt_kl))
            if lr_upper_bound is not None:
                logger.record_tabular("rejections", rejections)
                logger.record_tabular("lr_ub", lr_upper_bound)
            logger.record_tabular("eprewmean",
                                  safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean",
                                  safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
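
# The kl() helper above is the closed-form KL divergence between diagonal
# Gaussian policies, summed over action dimensions and averaged over the batch.
# A standalone check with illustrative numbers:
import numpy as np

def kl(new_mean, new_sd, old_mean, old_sd):
    approx_kl = np.log(new_sd / old_sd) + (
        old_sd ** 2 + (old_mean - new_mean) ** 2) / (2.0 * new_sd ** 2 + 1e-8) - 0.5
    return np.mean(np.sum(approx_kl, axis=1))

old_mean = np.zeros((1, 2)); old_sd = np.ones((1, 2))
new_mean = np.array([[0.1, -0.2]]); new_sd = np.array([[1.1, 0.9]])
print(kl(new_mean, new_sd, old_mean, old_sd))  # small positive number
print(kl(old_mean, old_sd, old_mean, old_sd))  # ~0.0 for identical policies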
class Model(object):

    def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32,
                 nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25,
                 max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear',
                 is_async=True):
        self.sess = sess = get_session()
        nbatch = nenvs * nsteps
        with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
            self.model = step_model = policy(nenvs, 1, sess=sess)
            self.model2 = train_model = policy(nenvs * nsteps, nsteps, sess=sess)

        # Let the distribution type build an action placeholder of the right
        # dtype and shape instead of constructing it by hand.
        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        self.logits = train_model.pi

        # training loss
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        entropy = tf.reduce_mean(train_model.pd.entropy())
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
        train_loss = pg_loss + vf_coef * vf_loss

        # Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("acktr_model")
        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
                epsilon=0.01, stats_decay=0.99, is_async=is_async, cold_iter=10,
                max_grad_norm=max_grad_norm)

            optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            # Advance the learning-rate schedule once per step in the batch.
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards,
                      PG_LR: cur_lr, VF_LR: cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
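
# A minimal usage sketch for the baselines-style variants above, following the
# build_policy pattern used in learn_hoof_a2c (hedged: `env` is assumed to be a
# VecEnv and 'cnn' is just an example network name; neither is defined here, so
# the sketch is left as comments):
#
# from baselines.common.policies import build_policy
#
# policy = build_policy(env, 'cnn')
# model = Model(policy=policy, ob_space=env.observation_space,
#               ac_space=env.action_space, nenvs=env.num_envs,
#               total_timesteps=int(1e6))
# policy_loss, value_loss, policy_entropy = model.train(
#     obs, states, rewards, masks, actions, values)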