def update_loss_trainer(progress):
    # Update all loss coefficients according to training progress.
    pg_coef = np.clip(self.start_pg_coef * (1.0 - 2.0 * progress), 0.0, 1.0)
    st_coef = np.clip(self.start_st_coef * (1.0 - 2.0 * progress), 0.0, 1.0)
    pg_lt_coef = np.clip(self.start_pg_lt_coef + 2.0 * progress,
                         self.start_pg_lt_coef, 1.0)
    loss = (pg_coef * pg_loss - entropy * ent_coef
            + (vf_loss + vf_loss_lt) * vf_coef
            + st_coef * st_loss + pg_lt_coef * pg_loss_lt)

    # Update parameters using the loss
    # 1. Get the model parameters
    params = find_trainable_variables("a2c_model")
    # 2. Calculate the gradients
    grads = tf.gradients(loss, params)
    if self.max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, grad_norm = tf.clip_by_global_norm(grads, self.max_grad_norm)
    grads = list(zip(grads, params))
    _train = trainer.apply_gradients(grads)
    return pg_coef, st_coef, pg_lt_coef
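
# A minimal standalone sketch (not part of the model above) of how the three
# coefficients behave over training progress. The start values used here
# (start_pg_coef=1.0, start_st_coef=1.0, start_pg_lt_coef=0.1) are illustrative
# assumptions: the short-term policy-gradient and supervised terms anneal to
# zero by the halfway point, while the long-term policy-gradient term ramps up.
import numpy as np

def coef_schedule(progress, start_pg_coef=1.0, start_st_coef=1.0, start_pg_lt_coef=0.1):
    pg_coef = np.clip(start_pg_coef * (1.0 - 2.0 * progress), 0.0, 1.0)
    st_coef = np.clip(start_st_coef * (1.0 - 2.0 * progress), 0.0, 1.0)
    pg_lt_coef = np.clip(start_pg_lt_coef + 2.0 * progress, start_pg_lt_coef, 1.0)
    return pg_coef, st_coef, pg_lt_coef

for p in (0.0, 0.25, 0.5, 1.0):
    print(p, coef_schedule(p))  # e.g. at p=0.5 -> (0.0, 0.0, 1.0)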
def add_noise(self, mean=0.0, stddev=0.01):
    sess = tf_util.get_session()
    params = find_trainable_variables(self.model_name)
    for param in params:
        variables_shape = tf.shape(param)
        noise = tf.random_normal(
            variables_shape,
            mean=mean,
            stddev=stddev,
            dtype=tf.float32,
        )
        sess.run(tf.assign_add(param, noise))
def get_train_op(loss_op):
    params = find_trainable_variables("model")
    # Switch from GATE_NONE to GATE_GRAPH to enhance reproducibility.
    # grads = tf.gradients(loss, params)
    grads_and_params = trainer.compute_gradients(
        loss=loss_op,
        var_list=params,
        gate_gradients=tf.train.RMSPropOptimizer.GATE_GRAPH)
    grads = [x[0] for x in grads_and_params]
    params = [x[1] for x in grads_and_params]
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    return trainer.apply_gradients(grads)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) # actions D = tf.placeholder(tf.float32, [nbatch]) # dones R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) params = find_trainable_variables("model") print("Params {}".format(len(params))) for var in params: print(var) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) print(v.name) return v with tf.variable_scope("", custom_getter=custom_getter, reuse=True): polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i v = tf.reduce_sum(train_model.pi * train_model.q, axis=-1) # shape is [nenvs * (nsteps + 1)] # strip off last step f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, A) q_i = get_by_index(q, A) # Compute ratios for importance truncation rho = f / (MU + eps) rho_i = get_by_index(rho, A) # Calculate Q_retrace targets qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling adv = qret - v logf = tf.log(f_i + eps) gain_f = logf * tf.stop_gradient( adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]) ) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2) gain_bc = tf.reduce_sum( logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1) #IMP: This is sum, as expectation wrt f loss_bc = -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]] * 2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = -f_pol / ( f + eps ) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj = 
tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g / ( nenvs * nsteps ) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) grads = [ gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params) ] avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging run_ops = [ _train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads ] names_ops = [ 'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if trust_region: run_ops = run_ops + [ norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] names_ops = names_ops + [ 'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] def train(obs, actions, rewards, dones, mus, states, masks, steps): cur_lr = lr.value_steps(steps) td_map = { train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks td_map[polyak_model.S] = states td_map[polyak_model.M] = masks return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) self.train = train self.save = save self.train_model = train_model self.step_model = step_model self.step = step_model.step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
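
# A minimal numpy sketch (independent of the graph built above) of the ACER
# trust-region step used there: the loss gradient g w.r.t. the policy
# statistics f is adjusted so its component along k, the gradient of
# KL(f_polyak || f) w.r.t. f, is limited:
#   adj = max(0, (k.g - delta) / ||k||^2),   g <- g - adj * k.
# The numeric values below are made up for illustration.
import numpy as np

def trust_region_adjust(g, k, delta, eps=1e-6):
    k_dot_g = np.sum(k * g, axis=-1)                          # [batch]
    adj = np.maximum(0.0, (k_dot_g - delta) /
                     (np.sum(np.square(k), axis=-1) + eps))   # [batch]
    return g - adj[:, None] * k

g = np.array([[0.3, -0.1, 0.2]])
k = np.array([[1.0, 0.0, 0.5]])
print(trust_region_adjust(g, k, delta=0.1))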
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nscripts=16, nsteps=20, nstack=4, ent_coef=0.1, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.001, kfac_clip=0.001, lrschedule='linear', alpha=0.99, epsilon=1e-5): config = tf.ConfigProto( allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nsml.bind(sess=sess) #nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) XY0 = tf.placeholder(tf.int32, [nbatch]) XY1 = tf.placeholder(tf.int32, [nbatch]) # ADV == TD_TARGET - values ADV = tf.placeholder(tf.float32, [nbatch]) TD_TARGET = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy( sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) self.model2 = train_model = policy( sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) # Policy 1 : Base Action : train_model.pi label = A script_mask = tf.concat( [ tf.zeros([nscripts * nsteps, 1]), tf.ones([(nprocs - nscripts) * nsteps, 1]) ], axis=0) pi = train_model.pi pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi, labels=A) neglogpac *= tf.stop_gradient(pac_weight) inv_A = 1.0 - tf.cast(A, tf.float32) xy0_mask = tf.cast(A, tf.float32) xy1_mask = tf.cast(A, tf.float32) condition0 = tf.equal(xy0_mask, 2) xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) xy0_mask = 1.0 - xy0_mask condition1 = tf.equal(xy1_mask, 2) xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) # One hot representation of chosen marine. # [batch_size, 2] pi_xy0 = train_model.pi_xy0 pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 pac_weight = tf.reduce_sum( pac_weight * tf.one_hot(XY0, depth=1024), axis=1) logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi_xy0, labels=XY0) logpac_xy0 *= tf.stop_gradient(pac_weight) logpac_xy0 *= tf.cast(xy0_mask, tf.float32) pi_xy1 = train_model.pi_xy1 pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 pac_weight = tf.reduce_sum( pac_weight * tf.one_hot(XY0, depth=1024), axis=1) # 1D? 2D? 
logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi_xy1, labels=XY1) logpac_xy1 *= tf.stop_gradient(pac_weight) logpac_xy1 *= tf.cast(xy1_mask, tf.float32) pg_loss = tf.reduce_mean(ADV * neglogpac) pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) vf_ = tf.squeeze(train_model.vf) vf_r = tf.concat( [ tf.ones([nscripts * nsteps, 1]), tf.zeros([(nprocs - nscripts) * nsteps, 1]) ], axis=0) * TD_TARGET vf_masked = vf_ * script_mask + vf_r #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps] vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) entropy = entropy_a + entropy_xy0 + entropy_xy1 loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) self.logits = logits = train_model.pi # xy0 self.params_common = params_common = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') self.params_xy0 = params_xy0 = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss self.grads_check_xy0 = grads_xy0 = tf.gradients( train_loss_xy0, params_xy0) if max_grad_norm is not None: grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) grads_xy0 = list(zip(grads_xy0, params_xy0)) trainer_xy0 = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) # xy1 self.params_xy1 = params_xy1 = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss self.grads_check_xy1 = grads_xy1 = tf.gradients( train_loss_xy1, params_xy1) if max_grad_norm is not None: grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) grads_xy1 = list(zip(grads_xy1, params_xy1)) trainer_xy1 = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, td_targets, masks, actions, xy0, xy1, values): advs = td_targets - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = { train_model.X: obs, A: actions, XY0: xy0, XY1: xy1, ADV: advs, TD_TARGET: td_targets, PG_LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _, \ policy_loss_xy0, policy_entropy_xy0, _, \ policy_loss_xy1, policy_entropy_xy1, _ = sess.run( [pg_loss, vf_loss, entropy, _train, pg_loss_xy0, entropy_xy0, _train_xy0, pg_loss_xy1, entropy_xy1, _train_xy1], td_map) return policy_loss, value_loss, policy_entropy, \ policy_loss_xy0, policy_entropy_xy0, \ policy_loss_xy1, policy_entropy_xy1 def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save 
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state

    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps writter = tf.summary.FileWriter( "/tmp/a2c_demo/1") # Change for SAT: this is to use tensorBoard A = tf.placeholder( tf.int32, [nbatch]) # Comments by Fei: this must be the action ADV = tf.placeholder( tf.float32, [nbatch]) # Comments by Fei: this must be the advantage R = tf.placeholder( tf.float32, [nbatch]) # Comments by Fei: this must be the reward LR = tf.placeholder( tf.float32, []) # Comments by Fei: this must be the learning rate step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) # Comments by Fei: pi is nbatch * nact pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) # writter.add_graph(sess.graph) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
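
# A small self-contained numpy sketch (illustrative only) of the scalar A2C
# loss assembled above: a policy-gradient term built from advantages and the
# negative log-probability of the taken actions, minus an entropy bonus, plus
# the value loss, with the default coefficients ent_coef=0.01 and vf_coef=0.5.
# The logits, actions, advantages, returns and values below are made up.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

logits = np.array([[1.0, 0.2, -0.5], [0.1, 0.3, 0.0]])   # [nbatch, nact]
actions = np.array([0, 2])
advs = np.array([0.7, -0.2])
rewards = np.array([1.0, 0.4])                            # discounted returns
values = np.array([0.3, 0.6])

probs = softmax(logits)
neglogpac = -np.log(probs[np.arange(len(actions)), actions])
pg_loss = np.mean(advs * neglogpac)
vf_loss = np.mean(np.square(values - rewards)) / 2.0      # assuming mse() is the halved squared error
entropy = np.mean(-(probs * np.log(probs)).sum(axis=-1))
loss = pg_loss - entropy * 0.01 + vf_loss * 0.5
print(loss)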
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', logdir=None): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs*nsteps ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=train_model.a0) entropy = tf.reduce_sum(cat_entropy(train_model.pi)) params = find_trainable_variables("model") tf.summary.histogram("vf", train_model.vf) tf.summary.histogram("R", R) if train_model.relaxed: pg_loss = tf.constant(0.0) oh_A = tf.one_hot(train_model.a0, ac_space.n) params = find_trainable_variables("model") policy_params = [v for v in params if "pi" in v.name] vf_params = [v for v in params if "vf" in v.name] entropy_grads = tf.gradients(entropy, policy_params) ddiff_loss = tf.reduce_sum(train_model.vf - train_model.vf_t) ddiff_grads = tf.gradients(ddiff_loss, policy_params) sm = tf.nn.softmax(train_model.pi) dlogp_dpi = oh_A * (1. - sm) + (1. - oh_A) * (-sm) pi_grads = -((tf.expand_dims(R, 1) - train_model.vf_t) * dlogp_dpi) pg_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads) pg_grads = [pg - dg for pg, dg in zip(pg_grads, ddiff_grads)] pi_param_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads) cv_grads = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0) cv_grad_splits = tf.reduce_sum(tf.square(cv_grads)) vf_loss = cv_grad_splits * vf_coef cv_grads = tf.gradients(vf_loss, vf_params) policy_grads = [] for e_grad, p_grad, param in zip(entropy_grads, pg_grads, policy_params): grad = -e_grad * ent_coef + p_grad policy_grads.append(grad) grad_dict = {} for g, v in list(zip(policy_grads, policy_params))+list(zip(cv_grads, vf_params)): grad_dict[v] = g grads = [grad_dict[v] for v in params] print(grads) else: pg_loss = tf.reduce_sum((tf.stop_gradient(R) - tf.stop_gradient(train_model.vf)) * neglogpac) policy_params = [v for v in params if "pi" in v.name] pg_grads = tf.gradients(pg_loss, policy_params) vf_loss = tf.reduce_sum(mse(tf.squeeze(train_model.vf), R)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef grads = tf.gradients(loss, params) grads = list(zip(grads, params)) ema = tf.train.ExponentialMovingAverage(.99) all_policy_grads = tf.concat([tf.reshape(g, [-1]) for g in pg_grads], 0) all_policy_grads_sq = tf.square(all_policy_grads) apply_mean_op = ema.apply([all_policy_grads, all_policy_grads_sq]) em_mean = ema.average(all_policy_grads) em_mean_sq = ema.average(all_policy_grads_sq) em_var = em_mean_sq - tf.square(em_mean) em_log_var = tf.log(em_var + 1e-20) mlgv = tf.reduce_mean(em_log_var) for g, v in grads: print(v.name, g) tf.summary.histogram(v.name, v) tf.summary.histogram(v.name+"_grad", g) self.sum_op = tf.summary.merge_all() self.writer = tf.summary.FileWriter(logdir) trainer = tf.train.AdamOptimizer(learning_rate=LR, beta2=.99999) with tf.control_dependencies([apply_mean_op]): _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) self._step = 0 
def train(obs, states, rewards, masks, u1, u2, values, summary=False): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X:obs, train_model.U1:u1, train_model.U2:u2, ADV:advs, R:rewards, LR:cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks if summary: sum_str, policy_loss, value_loss, policy_entropy, lv, _ = sess.run( [self.sum_op, pg_loss, vf_loss, entropy, mlgv, _train], td_map ) self.writer.add_summary(sum_str, self._step) else: policy_loss, value_loss, policy_entropy, lv, _ = sess.run( [pg_loss, vf_loss, entropy, mlgv, _train], td_map ) self._step += 1 return policy_loss, value_loss, policy_entropy, lv def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def load_variables_from_another_model(self, another_model_name):
    sess = tf_util.get_session()
    params = find_trainable_variables(self.model_name)
    another_params = find_trainable_variables(another_model_name)
    for pair in zip(params, another_params):
        sess.run(tf.assign(pair[0], pair[1]))
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) self.logits = logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV*logpac) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params=params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss,params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ momentum=0.9, kfac_update=1, epsilon=0.01,\ stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads,params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, master_ts = 1, worker_ts = 30, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, cell = 256, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', algo='regular', beta=1e-3): print('Create Session') gpu_options = tf.GPUOptions(allow_growth=True) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) nact = ac_space.n nbatch = nenvs*master_ts*worker_ts A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, 1, cell = cell, model='step_model', algo=algo) train_model = policy(sess, ob_space, ac_space, nbatch, master_ts, worker_ts, model='train_model', algo=algo) print('model_setting_done') #loss construction neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.wpi)) pg_loss = pg_loss - entropy * ent_coef print('algo: ', algo, 'max_grad_norm: ', str(max_grad_norm)) try: if algo == 'regular': loss = pg_loss + vf_coef * vf_loss elif algo == 'VIB': ''' implement VIB here, apart from the vf_loss and pg_loss, there should be a third loss, the kl_loss = ds.kl_divergence(model.encoding, prior), where prior is a Gaussian distribution with mu=0, std=1 the final loss should be pg_loss + vf_coef * vf_loss + beta*kl_loss ''' prior = ds.Normal(0.0, 1.0) kl_loss = tf.reduce_mean(ds.kl_divergence(train_model.encoding, prior)) loss = pg_loss + vf_coef * vf_loss + beta*kl_loss # pass else: raise Exception('Algorithm not exists') except Exception as e: print(e) grads, global_norm = grad_clip(loss, max_grad_norm, ['model']) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(wobs, whs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(whs)): cur_lr = lr.value() td_map = {train_model.wX:wobs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.wS] = states td_map[train_model.wM] = masks ''' you can add and run additional loss for VIB here for debugging, such as kl_loss ''' tloss, value_loss, policy_loss, policy_entropy, _ = sess.run( [loss, vf_loss, pg_loss, entropy, _train], feed_dict=td_map ) return tloss, value_loss, policy_loss, policy_entropy params = find_trainable_variables("model") def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.wvalue self.get_wh = step_model.get_wh self.initial_state = step_model.w_initial_state self.train = train self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
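
# A short illustrative sketch of the VIB KL term described in the docstring
# above, assuming the encoder outputs a diagonal Gaussian. Per latent
# dimension, KL(N(mu, sigma^2) || N(0, 1)) has the closed form
#   0.5 * (sigma^2 + mu^2 - 1 - log sigma^2),
# and the ds.kl_divergence call above evaluates this analytically for Normal
# distributions. The mu/sigma values below are made up.
import numpy as np

def kl_diag_gaussian_vs_standard_normal(mu, sigma):
    return 0.5 * np.sum(sigma**2 + mu**2 - 1.0 - np.log(sigma**2), axis=-1)

mu = np.array([[0.2, -0.1]])
sigma = np.array([[1.1, 0.9]])
print(kl_diag_gaussian_vs_standard_normal(mu, sigma))  # per-sample KL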
def __init__(self, policy, env, nsteps, ent_coef=1, vf_coef=0.5,
             max_grad_norm=0.5, fisher_matrix=None, star_param=None,
             lam=None, batch_size=None, fisher_matrix2=None, star_param2=None):
    sess = tf_util.get_session()
    nbatch = batch_size

    with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
        # train_model is used to train our network
        train_model = policy(nbatch, 1, sess)
        eval_model = policy(1, 1, sess)

    # Update parameters using the loss.
    # 1. Get the model parameters
    params = find_trainable_variables("a2c_model")
    print(len(params))

    ewc_loss = 0
    ewc_per_layer = []
    j = 0
    # star_param is the loaded array of anchor weights; params are the model's
    # parameters (tensors).
    for v in range(len(params) - 4):
        if v % 2 == 0:
            C = star_param[v].shape[-1]
            W = tf.transpose(tf.reshape(params[v] - star_param[v], [-1, C]))
            right = tf.reshape(
                tf.transpose(
                    tf.matmul(tf.matmul(fisher_matrix[j + 1], W),
                              fisher_matrix[j])), [-1, 1])
            W_ = tf.reshape(params[v] - star_param[v], [1, -1])
            ewc_loss += tf.matmul(W_, right)
            ewc_per_layer.append(tf.matmul(W_, right))
            j = j + 2
        else:
            B = tf.reshape(params[v] - star_param[v], [-1, 1])
            right_B = tf.matmul(fisher_matrix[j], B)
            B_ = tf.reshape(params[v] - star_param[v], [1, -1])
            ewc_loss += tf.matmul(B_, right_B)
            ewc_per_layer.append(tf.matmul(B_, right_B))
            j += 1

    ewc_loss2 = 0
    j = 0
    for v in range(len(params) - 4):
        if v % 2 == 0:
            C2 = star_param2[v].shape[-1]
            W2 = tf.transpose(
                tf.reshape(params[v] - star_param2[v], [-1, C2]))
            right2 = tf.reshape(
                tf.transpose(
                    tf.matmul(tf.matmul(fisher_matrix2[j + 1], W2),
                              fisher_matrix2[j])), [-1, 1])
            W_2 = tf.reshape(params[v] - star_param2[v], [1, -1])
            ewc_loss2 += tf.matmul(W_2, right2)
            j = j + 2
        else:
            B2 = tf.reshape(params[v] - star_param2[v], [-1, 1])
            right_B2 = tf.matmul(fisher_matrix2[j], B2)
            B_2 = tf.reshape(params[v] - star_param2[v], [1, -1])
            ewc_loss2 += tf.matmul(B_2, right_B2)
            j += 1

    loss1 = ewc_loss * (lam / 2)
    loss2 = ewc_loss2 * (lam / 2)
    loss = loss1 + loss2

    # 2. Calculate the gradients
    trainer = tf.train.AdamOptimizer(learning_rate=1e-3)
    # grads_and_var = trainer.compute_gradients(loss, params)
    # grads, var = zip(*grads_and_var)
    # grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5)
    # grads_and_var = list(zip(grads, var))
    grads = tf.gradients(loss, params)
    grads_and_var = list(zip(grads, params))
    # zip aggregates each gradient with its associated parameter,
    # e.g. zip(ABCD, xyza) => Ax, By, Cz, Da

    # 3. Make op for one policy and value update step of A2C
    # trainer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon)
    # trainer = tf.train.MomentumOptimizer(learning_rate=0.001, momentum=0.9, use_nesterov=True)
    _train = trainer.apply_gradients(grads_and_var)

    # grads2 = tf.gradients(loss2, params)
    # grads2 = list(zip(grads2, params))
    # trainer2 = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon)
    # trainer2 = tf.train.AdamOptimizer(learning_rate=1e-4)
    # trainer2 = tf.train.MomentumOptimizer(learning_rate=0.001, momentum=0.9, use_nesterov=True)
    # _train2 = trainer2.apply_gradients(grads2)

    def train():
        td_map = {train_model.keep_prob: 1.0}
        ewc1, ewc2, l, _ = sess.run([loss1, loss2, loss, _train], td_map)
        return ewc1, ewc2, l
        # KL, ewc, _ = sess.run([KL_loss, ewc_loss, _train], td_map)
        # return KL, ewc

    def creat_star_param():
        star_list = []
        for i in params[:-2]:
            star_list.append(star_param[params[i].name])
        return star_list

    self.creat_star_param = creat_star_param
    self.train = train
    self.train_model = train_model
    self.act = eval_model.step
    self.act2 = eval_model.step2
    self.save = functools.partial(tf_util.save_variables, sess=sess)
    self.load = functools.partial(tf_util.load_variables, sess=sess)
    tf.global_variables_initializer().run(session=sess)
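
# A short numpy check (illustrative, not the graph code) of the Kronecker-
# factor identity behind factored quadratic penalties like the EWC term above:
# for a weight deviation dW and Fisher factors A (input side) and B (output
# side),
#   vec(dW)^T (A kron B) vec(dW) == trace(dW^T B dW A^T),
# so the full Kronecker product never has to be materialised. The shapes and
# random factors below are arbitrary.
import numpy as np

rng = np.random.default_rng(0)
m, n = 3, 4                                    # dW is m x n
dW = rng.standard_normal((m, n))
A = rng.standard_normal((n, n)); A = A @ A.T   # symmetric PSD factors
B = rng.standard_normal((m, m)); B = B @ B.T

lhs = dW.flatten('F') @ np.kron(A, B) @ dW.flatten('F')   # column-major vec
rhs = np.trace(dW.T @ B @ dW @ A.T)
print(np.allclose(lhs, rhs))   # True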
def __init__(self, policy, nenvs, total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear', is_async=True): sess = get_session() nbatch = nenvs * nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) eval_model = policy(1, 1, sess) # A = train_model.pdtype.sample_placeholder([None]) # A = tf.placeholder(step_model.action.dtype, step_model.action.shape) probs = tf.nn.softmax(step_model.pi) class_ind = tf.to_int32(tf.multinomial(tf.log(probs), 1)[0][0]) self.pg_fisher = pg_fisher_loss = tf.log(probs[0, class_ind]) ##Fisher loss construction # self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac) # sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) # self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) # self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.joint_fisher = joint_fisher_loss = pg_fisher_loss self.params = params = find_trainable_variables("a2c_model") with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer() # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) stats = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params[:-4]) def compute_fisher(obs): # action = action[:, np.newaxis] td_map = {step_model.X: obs, step_model.keep_prob: 1.0} fisher = sess.run(stats, td_map) return fisher self.compute_fisher = compute_fisher self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear', is_async=True): self.sess = sess = get_session() nbatch = nenvs * nsteps with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE): self.model = step_model = policy(nenvs, 1, sess=sess) self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) self.logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV*neglogpac) entropy = tf.reduce_mean(train_model.pd.entropy()) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac) sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params=params = find_trainable_variables("acktr_model") self.grads_check = grads = tf.gradients(train_loss,params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ momentum=0.9, kfac_update=1, epsilon=0.01,\ stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm) # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads,params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr, VF_LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map ) return policy_loss, value_loss, policy_entropy self.train = train self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs*nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', param=None): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False, param=param) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True, param=param) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear', is_async=True): self.sess = sess = get_session() nbatch = nenvs * nsteps with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE): self.model = step_model = policy(nenvs, 1, sess=sess) self.model2 = train_model = policy(nenvs * nsteps, nsteps, sess=sess) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) self.logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV * neglogpac) entropy = tf.reduce_mean(train_model.pd.entropy()) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac) sample_net = train_model.vf + tf.random_normal(tf.shape( train_model.vf)) self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean( tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params = params = find_trainable_variables("acktr_model") self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ momentum=0.9, kfac_update=1, epsilon=0.01,\ stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm) # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, PG_LR: cur_lr, VF_LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map) return policy_loss, value_loss, policy_entropy self.train = train self.save = functools.partial(save_variables, sess=sess) self.load = functools.partial(load_variables, sess=sess) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, p, has_state): """ policy : Internal Policy model such as SnakeModel.CNNPolicy p : Hyperparameters required for training """ sess = tf_util.make_session() # Tensorflow model initiallization step_model = policy(sess=sess, p=p, train_phase=False, has_state=has_state) # Deploy model settings train_model = policy(sess=sess, p=p, train_phase=True, has_state=has_state) # Training model settings saver = tf.train.Saver() #Step 2 : Initialize the training parameters A = tf.placeholder(tf.int32, [p.N_BATCH]) ADV = tf.placeholder(tf.float32, [p.N_BATCH]) R = tf.placeholder(tf.float32, [p.N_BATCH]) LR = tf.placeholder(tf.float32, []) #Step 3 : Define the loss Function neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) # pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * p.ENTROPY_COEFF + vf_loss * p.VALUE_FUNC_COEFF #Step 4 : Define the loss optimizer params = find_trainable_variables("model") grads = tf.gradients(loss, params) if p.MAX_GRAD_NORM is not None: grads, grad_norm = tf.clip_by_global_norm( grads, p.MAX_GRAD_NORM ) # Clipping the gradients to protect learned weights grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=p.RMS_DECAY, epsilon=p.EPSILON) _train = trainer.apply_gradients( grads) # This is the variable which will be used lr = Scheduler(v=p.LEARNING_RATE, nvalues=p.N_TIMESTEPS, schedule=p.LEARNING_RATE_SCHEDULE ) # Learning rate changes linearly or as per arguments # Step 5 : Write down the summary parameters to be used writer = tf.summary.FileWriter(p.LOG_PATH) #summary writer def train(obs, rewards, masks, actions, values, states): """ obs : batch x n x m x 1 snake matrix rewards : batch x 1 rewards corrosponding to action actions : batch x 1 discrete action taken values : batch x 1 output of value function during the training process """ advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, train_model.S: states, A: actions, ADV: advs, R: rewards, LR: cur_lr } policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): #ps = sess.run(params) #make_path(save_path) #joblib.dump(ps, save_path) saver.save(sess, save_path) def load(load_path): #loaded_params = joblib.load(load_path) #restores = [] #for p, loaded_p in zip(params, loaded_params): # restores.append(p.assign(loaded_p)) #ps = sess.run(restores) saver.restore(sess, load_path) def add_scalar_summary(tag, value, step): summary = tf.Summary( value=[tf.Summary.Value(tag=tag, simple_value=value)]) writer.add_summary(summary, step) # Expose the user to closure functions self.train = train self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.hidden_value = step_model.hidden_value self.initial_state = step_model.initial_state self.add_scalar_summary = add_scalar_summary self.save = save self.load = load # Initialize global variables and add tf graph tf.global_variables_initializer().run(session=sess) writer.add_graph(tf.get_default_graph()) #write graph
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, lambda_dist=0.01, total_timesteps=None, lrschedule='linear'): sess = tf.get_default_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) config = Config() act_model = policy(config) config.reuse = True train_model = policy(config) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.logits, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.logits)) aux_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.rp_logits, labels=A) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + aux_loss * lambda_dist params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) saver = tf.train.Saver() def train(obs, rs, rr, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr, train_model.inputs_s: rs, train_model.inputs_r: rr } policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): saver.save(sess, save_path + 'model.ckpt') def load(load_path): saver.restore(sess, load_path + 'model.ckpt') self.train = train self.train_model = train_model self.act_model = act_model self.act = act_model.act self.value = act_model.value self.save = save self.load = load
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', network='cnn', prio_args=None): self.prio_args = prio_args sess = tf_util.get_session() nenvs = self.get_active_envs(env) nbatch = nenvs * nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) # our TD evaluating network A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) # TD loss # td_loss = losses.mean_squared_error(tf.squeeze(train_model.dt), TD) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef """prio model""" with tf.variable_scope('a2c_model_prio', reuse=tf.AUTO_REUSE): # prio_model = policy(nbatch, nsteps, sess) prio_model = MyNN(env, nbatch, network) P_R = tf.placeholder(tf.float32, [nbatch]) PRIO = tf.placeholder(tf.float32, [nbatch]) P_LR = tf.placeholder(tf.float32, []) # prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out), P_R) # Reward prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out), PRIO) # TD Error # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") params_prio = find_trainable_variables("a2c_model_prio") # 2. Calculate the gradients grads = tf.gradients(loss, params) prio_grads = tf.gradients(prio_model_loss, params_prio) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) prio_grads, prio_grad_norm = tf.clip_by_global_norm( prio_grads, max_grad_norm) grads = list(zip(grads, params)) prio_grads = list(zip(prio_grads, params_prio)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. 
    # Make op for one policy and value update step of A2C
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    prio_trainer = tf.train.RMSPropOptimizer(learning_rate=P_LR, decay=alpha, epsilon=epsilon)

    _train = trainer.apply_gradients(grads)
    _prio_train = prio_trainer.apply_gradients(prio_grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
        # rewards = R + yV(s')
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()

        td_map = {
            train_model.X: obs,
            A: actions,
            ADV: advs,
            R: rewards,
            LR: cur_lr
        }
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks

        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)

        prio_loss = 0
        if self.prio_args is not None:
            prio_values = GetValuesForPrio(self.prio_args['prio_type'],
                                           self.prio_args['prio_param'],
                                           advs, rewards)
            prio_td_map = {
                prio_model.X: obs,
                P_R: rewards,
                P_LR: cur_lr,
                PRIO: prio_values
            }
            prio_loss, _, p_td = sess.run(
                [prio_model_loss, _prio_train, PRIO], prio_td_map)
            # mb arranged as a 1D vector = [[env_1: n1, ..., n_nstep], ..., [env_n_active]]
            # need to take the last value of each env's buffer
            self.prio_score = prio_values[list(
                filter(lambda x: x % nsteps == (nsteps - 1),
                       range(len(prio_values))))]

        return policy_loss, value_loss, policy_entropy, prio_loss

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.prio_model = prio_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = functools.partial(tf_util.save_variables, sess=sess)
    self.load = functools.partial(tf_util.load_variables, sess=sess)

    tf.global_variables_initializer().run(session=sess)
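
# A tiny numpy illustration (not part of the class above) of the advantage
# computation used in train(): `rewards` are already discounted n-step returns
# bootstrapped with the value of the last state, so A(s,a) = R + gamma*V(s') - V(s)
# reduces to rewards - values. Episode terminations are ignored here and all
# numbers are made up.
import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0, 0.5])            # per-step rewards
last_value = 0.8                               # V(s') used to bootstrap
returns = np.zeros_like(rewards)
running = last_value
for t in reversed(range(len(rewards))):        # discounted return per step
    running = rewards[t] + gamma * running
    returns[t] = running

values = np.array([1.2, 0.9, 1.0])             # V(s) predicted by the critic
advs = returns - values
print(returns, advs)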
def __init__(self, policy, env, nsteps, ent_coef=1, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', fisher_matrix=None, star_param=None, lam=None, batch_size=None): sess = tf_util.get_session() nbatch = batch_size with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # train_model is used to train our network train_model = policy(nbatch, 1, sess) eval_model = policy(1, 1, sess) OUTPUT = tf.placeholder(tf.float32, [None, 6], name='sample_action') # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # constant = tf.constant(0.01,shape=train_model.pi.get_shape()) pi = tf.nn.softmax(OUTPUT) # u = tf.argmax(pi,axis=-1) # x = tf.one_hot(u, pi.get_shape().as_list()[-1]) # cross_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=train_model.pi,labels=x)) train_model_pi = tf.nn.softmax(train_model.pi3) KL = pi * tf.log(pi / (train_model_pi + 1e-4)) KL_loss = tf.reduce_mean(tf.reduce_sum(KL, 1)) # Update parameters using lossa. # 1. Get the model parameters params = find_trainable_variables("a2c_model") print(len(params)) ewc_loss = 0 #star_param是load进去的array|params是模型的参数,也是tensor # for i in range(len(fisher_matrix)): # j = 0 # for v in range(len(params)-8): # if v % 2 == 0: # C = star_param[v].shape[-1] # W = tf.transpose(tf.reshape(params[v] - star_param[v],[-1,C])) # right = tf.reshape(tf.transpose(tf.matmul(tf.matmul(fisher_matrix[i][j+1], W), fisher_matrix[i][j])),[-1,1]) # W_ = tf.reshape(params[v] - star_param[v], [1,-1]) # ewc_loss += tf.matmul(W_, right) # j = j + 2 # else: # B = tf.reshape(params[v] - star_param[v], [-1,1]) # right_B = tf.matmul(fisher_matrix[i][j], B) # B_ = tf.reshape(params[v] - star_param[v], [1,-1]) # ewc_loss += tf.matmul(B_, right_B) # j += 1 # for v in range(len(params)-4): # if v % 2 == 0: # C = int(star_param[v].shape[-1]) # W_hat = tf.concat([tf.reshape(params[v], [-1, C]), tf.reshape(params[v + 1], [1, -1])], 0) # W_hat_fixed = tf.concat( # [tf.reshape(star_param[v], [-1, C]), tf.reshape(star_param[v + 1], [1, -1])], 0) # # W = tf.transpose(W_hat - W_hat_fixed) # right = tf.reshape(tf.transpose(tf.matmul(tf.matmul(fisher_matrix[v + 1], W), fisher_matrix[v])),[-1, 1]) # W_ = tf.reshape(W_hat - W_hat_fixed, [1, -1]) # ewc_loss += tf.matmul(W_, right) for i in range(len(fisher_matrix)): for v in range(len(params) - 6): # if v == 6: ewc_loss += tf.reduce_sum( tf.multiply(fisher_matrix[i][v].astype(np.float32), tf.square(params[v] - star_param[v]))) loss1 = KL_loss * ent_coef loss2 = ewc_loss * (lam / 2) loss = loss1 + loss2 # 2. Calculate the gradients trainer = tf.train.AdamOptimizer(learning_rate=1e-3) # trainer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon) # grads_and_var = trainer.compute_gradients(loss, params) # grads, var = zip(*grads_and_var) # grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5) # # grads_and_var = list(zip(grads, var)) grads = tf.gradients(loss, params) grads_and_var = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. 
Make op for one policy and value update step of A2C # trainer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon) # trainer = tf.train.MomentumOptimizer(learning_rate=0.001,momentum=0.9,use_nesterov=True) _train = trainer.apply_gradients(grads_and_var) # # # grads2 = tf.gradients(loss2,params) # grads2 = list(zip(grads2, params)) # trainer2 = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon) # trainer2 = tf.train.AdamOptimizer(learning_rate=1e-4) # trainer2 = tf.train.MomentumOptimizer(learning_rate=0.001, momentum=0.9, use_nesterov=True) # _train2 = trainer2.apply_gradients(grads2) def train(obs, actions): td_map = { train_model.X: obs, OUTPUT: actions, train_model.keep_prob: 1.0 } kl, ewc, l, _ = sess.run([KL_loss, ewc_loss, loss, _train], td_map) return kl, ewc, l # KL,ewc,_ = sess.run([KL_loss,ewc_loss,_train],td_map) # return KL, ewc def creat_star_param(): # iterate by index so that params[i].name is well-defined star_list = [] for i in range(len(params) - 2): star_list.append(star_param[params[i].name]) return star_list self.creat_star_param = creat_star_param self.train = train self.train_model = train_model self.act = eval_model.step self.act2 = eval_model.step2 self.act3 = eval_model.step3 self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
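The active EWC term above is the diagonal-Fisher form of elastic weight consolidation: for every stored task and every parameter tensor it accumulates F * (theta - theta_star)^2, and the final objective adds (lam / 2) times that sum. A small self-contained numpy sketch of the same penalty (all arrays are toy data, not the model's):

import numpy as np

rng = np.random.default_rng(0)
theta      = [rng.normal(size=(4, 3)), rng.normal(size=(3,))]   # current parameters
theta_star = [rng.normal(size=(4, 3)), rng.normal(size=(3,))]   # parameters saved after the old task
fisher     = [[rng.random((4, 3)), rng.random((3,))]]           # one task's diagonal Fisher estimates

lam = 10.0
ewc_loss = 0.0
for task_fisher in fisher:                       # outer loop: one entry per remembered task
    for F, t, t_star in zip(task_fisher, theta, theta_star):
        ewc_loss += np.sum(F * (t - t_star) ** 2)
penalty = (lam / 2.0) * ewc_loss                 # added to the KL/policy loss, as above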
def learn_vanilla_a2c( network, env, optimiser, seed=None, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100, load_path=None, **network_kwargs): ''' Main entrypoint for A2C algorithm. Train a policy with a given network architecture on a given environment using the A2C algorithm. Parameters: ----------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes a tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output_tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) seed: seed to make the random number sequence in the algorithm reproducible. By default None, which means the seed is taken from the system noise generator (not reproducible) nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int, total number of timesteps to train on (default: 80M) vf_coef: float, coefficient in front of value function loss in the total loss function (default: 0.5) ent_coef: float, coefficient in front of the policy entropy in the total loss function (default: 0.01) max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5) lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) lrschedule: schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) alpha: float, RMSProp decay parameter (default: 0.99) gamma: float, reward discounting parameter (default: 0.99) log_interval: int, specifies how frequently the logs are printed out (default: 100) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) # Get the nb of env nenvs = env.num_envs policy = build_policy(env, network, **network_kwargs) # Instantiate the model object (that creates step_model and train_model) model = Model(optimiser=optimiser, policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule) if load_path is not None: model.load(load_path) # Instantiate the runner object runner = VanillaRunner(env, model, nsteps=nsteps, gamma=gamma) epinfobuf = deque(maxlen=100) # Calculate the batch_size nbatch = nenvs*nsteps def kl(new_mean, new_sd, old_mean, old_sd): # added for logging approx_kl = np.log(new_sd/old_sd) + (old_sd**2 + (old_mean - new_mean)**2)/(2.0*new_sd**2 + 10**-8) - 0.5 approx_kl = np.sum(approx_kl, axis=1) approx_kl = np.mean(approx_kl) return approx_kl # model helper functions model_params = find_trainable_variables("a2c_model") get_flat = U.GetFlat(model_params) # Start total timer tstart = time.time() for update in range(1, int(total_timesteps//nbatch+1)): old_params = get_flat() # Get mini batch of experiences obs, states, rewards, masks, actions, values, epinfos = runner.run() epinfobuf.extend(epinfos) old_mean, old_sd = model.get_mean_std(obs) # added policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values) nseconds = time.time()-tstart #added new_mean, new_sd = model.get_mean_std(obs) approx_kl = kl(new_mean, new_sd, old_mean, old_sd) new_params = get_flat() grads = new_params - old_params # Calculate the fps (frame per second) fps = int((update*nbatch)/nseconds) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update*nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) logger.record_tabular("approx_kl", float(approx_kl)) logger.record_tabular("grad_norm", np.linalg.norm(grads)) logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() return model
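The kl() helper above is the closed-form KL divergence KL(old || new) between diagonal Gaussian policies, summed over action dimensions and averaged over the batch. A quick numpy sanity check of that formula against a Monte Carlo estimate (all numbers are toy values):

import numpy as np

rng = np.random.default_rng(1)
old_mean, old_sd = np.array([0.3, -0.7]), np.array([0.8, 1.2])
new_mean, new_sd = np.array([0.1, -0.2]), np.array([1.0, 0.9])

closed_form = np.sum(np.log(new_sd / old_sd)
                     + (old_sd**2 + (old_mean - new_mean)**2) / (2.0 * new_sd**2) - 0.5)

def log_pdf(x, mean, sd):
    return np.sum(-0.5 * np.log(2 * np.pi * sd**2) - (x - mean)**2 / (2 * sd**2), axis=-1)

x = old_mean + old_sd * rng.standard_normal((200000, 2))    # samples from the old policy
mc_estimate = np.mean(log_pdf(x, old_mean, old_sd) - log_pdf(x, new_mean, new_sd))
# closed_form and mc_estimate agree up to Monte Carlo noise at this sample size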
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef, vf_coef, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nbatch = nenvs * nsteps step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) A = train_model.pdtype.sample_placeholder([nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) eps = 1e-6 #nadv = ADV / (train_model.ret_rms.std + eps) #nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps) nadv = (ADV - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps) nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps) nlogpac = -train_model.pd.logp(A) pg_loss = tf.reduce_mean(nadv * nlogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), nr)) #vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vnorm), nr)) entropy = tf.reduce_mean(train_model.pd.entropy()) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) avg_norm_ret = tf.reduce_mean(tf.abs(train_model.ret_rms.mean)) avg_norm_obs = tf.reduce_mean(tf.abs(train_model.ob_rms.mean)) def train(obs, states, returns, masks, actions, values): advs = returns - values #advs = (advs - np.mean(advs)) / (np.std(advs) + eps) for step in range(len(obs)): cur_lr = lr.value() if hasattr(train_model, "ob_rms"): train_model.ob_rms.update( sess, obs) # update running mean/std for observations of policy if hasattr(train_model, "ret_rms"): train_model.ret_rms.update( sess, returns) # # update running mean/std for returns td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks ravg_norm_obs, policy_loss, value_loss, policy_entropy, _ = sess.run( [avg_norm_obs, pg_loss, vf_loss, entropy, _train], td_map) return ravg_norm_obs, policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta, icm ): sess = get_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) # actions D = tf.placeholder(tf.float32, [nbatch]) # dones R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape) train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape) with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE): step_model = policy(nbatch=nenvs , nsteps=1,observ_placeholder=step_ob_placeholder, sess=sess) train_model = policy(nbatch=nbatch , nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess) params = find_trainable_variables("acer_model") print("Params {}".format(len(params))) # for var in params: # print(var) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) print(v.name) return v with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True): polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i # shape is [n_envs * (n_steps + 1)] # action probability distributions according to train_model, polyak_model and step_model # poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax train_model_p = tf.nn.softmax(train_model.pi) polyak_model_p = tf.nn.softmax(polyak_model.pi) step_model_p = tf.nn.softmax(step_model.pi) # train model policy probility and train model q value v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)] # strip off last step # dictribution_f , f_polyak, q_value f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, A) q_i = get_by_index(q, A) # Compute ratios for importance truncation rho = f / (MU + eps) rho_i = get_by_index(rho, A) # Calculate Q_retrace targets # passed # R = reward , D = done_ph , v = value ,... 
rest is same qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) # f is distribution here # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) # v is value here check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling if icm is None : adv = qret - v # v is value here else : # print("Adv Normalization") # > Advantage Normalization adv = qret - v # m , s = get_mean_and_std(icm_adv) # advs = (icm_adv - m) / (s + 1e-7) # > Advantage Normalization logf = tf.log(f_i + eps) # c is correction term # importance weight clipping factor : c gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2) gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f loss_bc= -tf.reduce_mean(gain_bc) # IMP: This is sum, as expectation wrt f loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]]*2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. 
avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g/(nenvs*nsteps) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)] avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) if icm is not None : # print("with ICM") grads = grads + icm.pred_grads_and_vars trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging if icm is not None : # print("With ICM") run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads , icm.forw_loss , icm.inv_loss, icm.icm_loss] names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ,'icm.forw_loss' , 'icm.inv_loss', 'icm.icm_loss' ] if trust_region: run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] else : run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads ] names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if trust_region: run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj ] names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj' ] def train(obs, actions, rewards, dones, mus, states, masks, steps, next_states, icm_actions ): cur_lr = lr.value_steps(steps) if icm is not None : print("with ICM ") td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr , icm.state_:obs, icm.next_state_ : next_states , icm.action_ : icm_actions} else : td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks td_map[polyak_model.S] = states td_map[polyak_model.M] = masks return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train def _step(observation, **kwargs): return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs) self.train = train self.save = functools.partial(save_variables, sess=sess, variables=params) self.train_model = train_model self.step_model = step_model self._step = _step self.step = self.step_model.step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
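The trust-region branch above rescales the policy-statistics gradient so that its component along k, the gradient of KL(f_polyak || f) with respect to f, stays within delta. A small numpy sketch of just that projection (random toy vectors, not graph tensors):

import numpy as np

rng = np.random.default_rng(2)
g = rng.normal(size=6)        # gradient wrt the policy statistics f
k = rng.normal(size=6)        # gradient of KL(f_polyak || f) wrt f
delta, eps = 0.5, 1e-6

adj = max(0.0, (np.dot(k, g) - delta) / (np.dot(k, k) + eps))
g_adj = g - adj * k           # same update as g - reshape(adj, ...) * k above

assert np.dot(k, g_adj) <= delta + 1e-4   # directional derivative of the KL term is capped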
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta): sess = get_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) # actions D = tf.placeholder(tf.float32, [nbatch]) # dones R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape) train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape) with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE): step_model = policy(nbatch=nenvs, nsteps=1, observ_placeholder=step_ob_placeholder, sess=sess) train_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess) params = find_trainable_variables("acer_model") print("Params {}".format(len(params))) for var in params: print(var) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) def custom_getter(getter, *args, **kwargs): v = ema.average(getter(*args, **kwargs)) print(v.name) return v with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True): polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i # action probability distributions according to train_model, polyak_model and step_model # poilcy.pi is probability distribution parameters; to obtain distribution that sums to 1 need to take softmax train_model_p = tf.nn.softmax(train_model.pi) polyak_model_p = tf.nn.softmax(polyak_model.pi) step_model_p = tf.nn.softmax(step_model.pi) v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)] # strip off last step f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, A) q_i = get_by_index(q, A) # Compute ratios for importance truncation rho = f / (MU + eps) rho_i = get_by_index(rho, A) # Calculate Q_retrace targets qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps)) entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling adv = qret - v logf = tf.log(f_i + eps) gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2) gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f loss_bc= -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]]*2) ev = 
q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) loss = loss_policy + q_coef * loss_q - ent_coef * entropy if trust_region: g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g/(nenvs*nsteps) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)] avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads] names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads'] if trust_region: run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj] names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'] def train(obs, actions, rewards, dones, mus, states, masks, steps): cur_lr = lr.value_steps(steps) td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks td_map[polyak_model.S] = states td_map[polyak_model.M] = masks return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train def _step(observation, **kwargs): return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs) self.train = train self.save = functools.partial(save_variables, sess=sess, variables=params) self.train_model = train_model self.step_model = step_model self._step = _step self.step = self.step_model.step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
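q_retrace above builds the Retrace targets backwards through the rollout with truncated importance weights min(1, rho). A hedged single-environment numpy sketch of the recursion it is assumed to implement (baselines-style), where v carries one extra bootstrap value for the state after the last step:

import numpy as np

def retrace_targets(rewards, dones, q_i, v, rho_i, gamma=0.99):
    nsteps = len(rewards)
    rho_bar = np.minimum(1.0, rho_i)          # truncated importance weights
    qret = v[-1]                              # bootstrap from the final state's value
    targets = np.empty(nsteps)
    for t in reversed(range(nsteps)):
        qret = rewards[t] + gamma * qret * (1.0 - dones[t])
        targets[t] = qret
        # mix the target back toward the critic before moving one step earlier
        qret = rho_bar[t] * (qret - q_i[t]) + v[t]
    return targets

# toy 3-step rollout
print(retrace_targets(rewards=np.array([1.0, 0.0, 1.0]),
                      dones=np.array([0.0, 0.0, 1.0]),
                      q_i=np.array([0.5, 0.4, 0.9]),
                      v=np.array([0.6, 0.5, 0.8, 0.7]),
                      rho_i=np.array([1.3, 0.7, 1.0])))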
def __init__(self, policy, ob_space, ac_space, n_envs, total_timesteps, nprocs=32, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear'): """ The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144 :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) :param ob_space: (Gym Space) The observation space :param ac_space: (Gym Space) The action space :param n_envs: (int) The number of environments :param total_timesteps: (int) The total number of timesteps for training the model :param nprocs: (int) The number of threads for TensorFlow operations :param n_steps: (int) The number of steps to run for each environment :param ent_coef: (float) The weight for the entropic loss :param vf_coef: (float) The weight for the loss on the value function :param vf_fisher_coef: (float) The weight for the fisher loss on the value function :param learning_rate: (float) The initial learning rate for the RMS prop optimizer :param max_grad_norm: (float) The clipping value for the maximum gradient :param kfac_clip: (float) gradient clipping for Kullback leiber :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop') """ config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) n_batch = n_envs * n_steps action_ph = tf.placeholder(tf.int32, [n_batch]) advs_ph = tf.placeholder(tf.float32, [n_batch]) rewards_ph = tf.placeholder(tf.float32, [n_batch]) pg_lr_ph = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, n_envs, 1, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, n_envs * n_steps, n_steps, reuse=True) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.policy, labels=action_ph) self.logits = train_model.policy # training loss pg_loss = tf.reduce_mean(advs_ph * logpac) entropy = tf.reduce_mean(calc_entropy(train_model.policy)) pg_loss = pg_loss - ent_coef * entropy vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph) train_loss = pg_loss + vf_coef * vf_loss # Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.value_fn + tf.random_normal( tf.shape(train_model.value_fn)) self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean( tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2)) self.joint_fisher = pg_fisher_loss + vf_fisher_loss self.params = params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer( learning_rate=pg_lr_ph, clip_kl=kfac_clip, momentum=0.9, kfac_update=1, epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) optim.compute_and_apply_stats(self.joint_fisher, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner self.learning_rate = Scheduler(initial_value=learning_rate, n_values=total_timesteps, schedule=lr_schedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for _ in range(len(obs)): cur_lr = self.learning_rate.value() td_map = { train_model.obs_ph: obs, action_ph: actions, advs_ph: advs, 
rewards_ph: rewards, pg_lr_ph: cur_lr } if states is not None: td_map[train_model.states_ph] = states td_map[train_model.masks_ph] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): session_params = sess.run(params) joblib.dump(session_params, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for param, loaded_p in zip(params, loaded_params): restores.append(param.assign(loaded_p)) sess.run(restores) self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
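The vf_fisher_loss above regresses the value head toward a sampled target value_fn + noise behind a stop_gradient; for a unit-variance Gaussian observation model this makes the curvature KFAC estimates equal to the Gauss-Newton/Fisher matrix. A toy numpy check of that identity for a linear "value head" f(x) = x @ theta (everything here is synthetic, not KFAC itself):

import numpy as np

rng = np.random.default_rng(3)
X = rng.normal(size=(5000, 3))            # toy observations
theta = rng.normal(size=3)
f = X @ theta                             # toy value predictions

y = f + rng.standard_normal(5000)         # sampled targets, like value_fn + random_normal(...)
score = (y - f)[:, None] * X              # per-example grad of log N(y; f, 1) wrt theta
fisher_mc = score.T @ score / len(X)      # Monte Carlo Fisher from sampled targets
fisher_exact = X.T @ X / len(X)           # E[x x^T], the Gauss-Newton matrix for this model
print(np.max(np.abs(fisher_mc - fisher_exact)))   # small; shrinks as the sample count grows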
def __init__(self, policy, ent_coef=1, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', batch_size=None): sess = tf_util.get_session() nbatch = batch_size with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # train_model is used to train our network train_model = policy(nbatch, 1, sess) eval_model = policy(1, 1, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) OUTPUT = tf.placeholder(tf.float32, [None, 6]) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss pi = tf.nn.softmax(OUTPUT) train_model_pi = tf.nn.softmax(train_model.pi2) # Policy loss # neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) # entropy_loss = tf.reduce_mean(neglogpac) KL = pi * tf.log(pi / train_model_pi) KL_loss = tf.reduce_mean(tf.reduce_sum(KL, 1)) # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # regularization_loss = tf.contrib.layers.apply_regularization(regularizer=tf.contrib.layers.l2_regularizer(1e-4), # weights_list=params) loss = KL_loss * ent_coef # 2. Calculate the gradients grads = tf.gradients(loss, params) grads = list(zip(grads, params)) # 3. Make op for one policy and value update step of A2C # trainer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon) trainer = tf.train.AdamOptimizer(learning_rate=1e-3) _train = trainer.apply_gradients(grads) def train(obs, actions): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') td_map = { train_model.X: obs, OUTPUT: actions, train_model.keep_prob: 1.0 } l, _ = sess.run([loss, _train], td_map) return l self.train = train self.train_model = train_model self.act = eval_model.step self.act2 = eval_model.step2 self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
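The objective above is a distillation-style KL between the recorded action distribution fed through OUTPUT and the network's current softmax. The same computation in plain numpy (the logits below are made-up values with the assumed 6 actions):

import numpy as np

def softmax(z):
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

teacher_logits = np.array([[2.0, 0.5, -1.0, 0.0, 0.0, 0.0],
                           [0.0, 1.0,  1.0, 0.0, 0.0, 0.0]])   # recorded targets (OUTPUT)
student_logits = np.array([[1.5, 0.6, -0.5, 0.0, 0.0, 0.0],
                           [0.2, 0.9,  0.8, 0.1, 0.0, 0.0]])   # current policy logits

p, q = softmax(teacher_logits), softmax(student_logits)
kl_loss = np.mean(np.sum(p * np.log(p / q), axis=1))   # >= 0, zero only when p == q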
def learn_ent_hoof_a2c(network, env, optimiser, seed=None, nsteps=5, total_timesteps=int(1e6), lr_upper_bound=None, ent_upper_bound=None, num_lr=None, num_ent_coeff=None, gamma=0.99, max_kl=None, max_grad_norm=0.5, log_interval=100, load_path=None, **network_kwargs): ''' Main entrypoint for A2C algorithm. Train a policy with a given network architecture on a given environment using the A2C algorithm. Parameters: ----------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes a tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output_tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See baselines.common/policies.py/lstm for more details on using recurrent nets in policies env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) seed: seed to make the random number sequence in the algorithm reproducible. By default None, which means the seed is taken from the system noise generator (not reproducible) nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int, total number of timesteps to train on (default: 1M) max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5) lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) gamma: float, reward discounting parameter (default: 0.99) log_interval: int, specifies how frequently the logs are printed out (default: 100) **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) # Get the nb of env nenvs = env.num_envs policy = build_policy(env, network, **network_kwargs) # Instantiate the model object (that creates step_model and train_model) model = Ent_HOOF_Model(optimiser=optimiser, policy=policy, env=env, nsteps=nsteps, total_timesteps=total_timesteps, max_grad_norm=max_grad_norm) runner = HOOF_Runner(env, model, nsteps=nsteps, gamma=gamma) epinfobuf = deque(maxlen=100) # Calculate the batch_size nbatch = nenvs * nsteps # model helper functions model_params = find_trainable_variables("a2c_model") get_flat = U.GetFlat(model_params) set_from_flat = U.SetFromFlat(model_params) def kl(new_mean, new_sd, old_mean, old_sd): approx_kl = np.log(new_sd / old_sd) + ( old_sd**2 + (old_mean - new_mean)**2) / (2.0 * new_sd**2 + 10**-8) - 0.5 approx_kl = np.sum(approx_kl, axis=1) approx_kl = np.mean(approx_kl) return approx_kl if max_kl is None: # set max kl to a high val in case there is no constraint max_kl = 10**8 # Start total timer tstart = time.time() for update in range(1, int(total_timesteps // nbatch + 1)): opt_pol_val = -10**8 approx_kl = np.zeros((num_ent_coeff, num_lr)) epv = np.zeros((num_ent_coeff, num_lr)) rand_lr = lr_upper_bound * np.random.rand(num_lr) rand_lr = np.sort(rand_lr) rand_ent_coeff = ent_upper_bound * np.random.rand(num_ent_coeff) old_params = get_flat() rms_weights_before_upd = model.get_opt_state() obs, states, rewards, masks, actions, values, undisc_rwds, epinfos = runner.run( ) epinfobuf.extend(epinfos) old_mean, old_sd, old_neg_ll = model.get_mean_std_neg_ll(obs, actions) for nec in range(num_ent_coeff): # reset policy and rms prop optimiser set_from_flat(old_params) model.set_opt_state(rms_weights_before_upd) # get grads for loss fn with given entropy coeff policy_loss, value_loss, policy_entropy = model.train( obs, states, rewards, masks, actions, values, rand_ent_coeff[nec]) new_params = get_flat() ent_grads = new_params - old_params # enumerate over different LR for nlr in range(num_lr): new_params = old_params + rand_lr[nlr] * ent_grads set_from_flat(new_params) new_mean, new_sd, new_neg_ll = model.get_mean_std_neg_ll( obs, actions) lik_ratio = np.exp(-new_neg_ll + old_neg_ll) est_pol_val = wis_estimate(nenvs, nsteps, undisc_rwds, lik_ratio) approx_kl[nec, nlr] = kl(new_mean, new_sd, old_mean, old_sd) epv[nec, nlr] = est_pol_val if (nec == 0 and nlr == 0) or (est_pol_val > opt_pol_val and approx_kl[nec, nlr] < max_kl): opt_pol_val = est_pol_val opt_pol_params = get_flat() opt_rms_wts = model.get_opt_state() opt_lr = rand_lr[nlr] opt_ent_coeff = rand_ent_coeff[nec] opt_kl = approx_kl[nec, nlr] # update policy and rms prop to optimal wts set_from_flat(opt_pol_params) model.set_opt_state(opt_rms_wts) # Shrink LR search space if too many get rejected rejections = np.sum(approx_kl > max_kl) / num_lr if rejections > 0.8: lr_upper_bound *= 0.8 if rejections == 0: lr_upper_bound *= 1.25 nseconds = time.time() - tstart # Calculate the fps (frame per second) fps = int((update * nbatch) / nseconds) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, rewards) logger.record_tabular("nupdates", update) logger.record_tabular("total_timesteps", update * nbatch) logger.record_tabular("fps", fps) logger.record_tabular("policy_entropy", float(policy_entropy)) logger.record_tabular("value_loss", float(value_loss)) logger.record_tabular("explained_variance", float(ev)) 
logger.record_tabular("opt_lr", float(opt_lr)) logger.record_tabular("ent_coeff", float(opt_ent_coeff)) logger.record_tabular("approx_kl", float(opt_kl)) logger.record_tabular("rejections", rejections) logger.record_tabular("lr_ub", lr_upper_bound) logger.record_tabular( "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular( "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() return model
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) self.logits = logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV * logpac) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.vf + tf.random_normal(tf.shape( train_model.vf)) self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean( tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params = params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ momentum=0.9, kfac_update=1, epsilon=0.01,\ stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, PG_LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def __init__(self, optimiser, policy, env, nsteps, vf_coef=0.5, max_grad_norm=0.5, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6)): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs * nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) Ent_Coeff = tf.placeholder(tf.float32, []) # for Entropy # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy * Ent_Coeff + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Make op for one policy and value update step of A2C if optimiser == 'RMSProp': trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) elif optimiser == 'SGD': trainer = tf.train.GradientDescentOptimizer(learning_rate=LR) _train = trainer.apply_gradients(grads) #https://stackoverflow.com/a/45624533 _slot_vars = [ trainer.get_slot(var, name) for name in trainer.get_slot_names() for var in params ] SLOTS = [tf.placeholder(tf.float32, slot.shape) for slot in _slot_vars] _set_slots = [var.assign(SLOTS[i]) for i, var in enumerate(_slot_vars)] def get_opt_state(): return sess.run(_slot_vars) def set_opt_state(state): feed = {k: v for k, v in zip(SLOTS, state)} return sess.run(_set_slots, feed) def train(obs, states, rewards, masks, actions, values, ent_coeff): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, Ent_Coeff: ent_coeff, LR: 1.0 } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy # Only this bit added def get_mean_std_neg_ll(obs, actions): td_map = {train_model.X: obs, A: actions} vals = sess.run( [train_model.pd.mean, train_model.pd.std, neglogpac], td_map) return vals self.get_mean_std_neg_ll = get_mean_std_neg_ll self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) self.get_opt_state = get_opt_state self.set_opt_state = set_opt_state tf.global_variables_initializer().run(session=sess)
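get_opt_state / set_opt_state above exist because rolling back the policy weights alone is not enough for the HOOF search: the next RMSProp step also depends on the optimiser's slot accumulators. A tiny numpy illustration of that point (a hand-rolled RMSProp step, not TensorFlow's):

import numpy as np

def rmsprop_step(theta, acc, grad, lr=0.1, decay=0.99, eps=1e-5):
    acc = decay * acc + (1 - decay) * grad**2
    return theta - lr * grad / (np.sqrt(acc) + eps), acc

theta, acc = np.zeros(2), np.zeros(2)
theta, acc = rmsprop_step(theta, acc, np.array([1.0, -2.0]))   # warm up the accumulator

saved_theta, saved_acc = theta.copy(), acc.copy()              # analogous to get_flat + get_opt_state
step_with_slots, _ = rmsprop_step(saved_theta, saved_acc, np.array([0.5, 0.5]))
step_fresh_slots, _ = rmsprop_step(theta, np.zeros(2), np.array([0.5, 0.5]))   # weights restored, slots not
assert not np.allclose(step_with_slots, step_fresh_slots)      # the slot state changes the update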
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs * nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): step_model = policy(nenvs, 1, sess) train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("a2c_model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
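Scheduler above is assumed to behave like the baselines scheduler: every value() call accounts for one consumed timestep and, with the 'linear' schedule, returns lr scaled by the fraction of training still remaining. A minimal sketch under that assumption (LinearScheduler is illustrative, not the library class):

class LinearScheduler:
    def __init__(self, v, nvalues):
        self.v, self.nvalues, self.n = v, nvalues, 0

    def value(self):
        current = self.v * max(1.0 - self.n / self.nvalues, 0.0)
        self.n += 1
        return current

sched = LinearScheduler(v=7e-4, nvalues=10)
print([round(sched.value(), 6) for _ in range(10)])   # decays from 7e-4 toward 0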
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', replay_lambda=1, ss_rate=1, replay_loss=None): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs*nsteps # If we have replay_loss, create replay buffer and stage buffer # Use this to enforce replay loss lower if replay_loss is not None: self.replay_buffer = [] # holds all past data A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) # Introduce replay_loss if given if replay_loss == "L2": # Replace train_model.pi with whatever is predicted label # Replace A with whatever is recorded label re_loss = tf.nn.l2_loss(tf.nn.softmax(train_model.pi) - A) / nbatch elif replay_loss == "Distillation": # Replace y_donor with whatever is recorded label # Replace y_acceptor with whatever is predicted label re_loss = tf.reduce_mean( - tf.reduce_sum(tf.stop_gradient(y_donor) * tf.log(y_acceptor), reduction_indices=1)) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef if replay_loss is not None: loss = loss + replay_lambda*re_loss params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
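The replay-loss branches above are placeholders: the L2 form compares a softmax to the integer action placeholder A, and the distillation form references y_donor / y_acceptor, which are never defined, so the comments tell the reader to substitute recorded and predicted labels. One way to make that concrete, sketched with the same TF1-style API (RECORDED_A and the sizes are assumptions, not the project's code):

import tensorflow as tf

nbatch, nact = 8, 4
logits = tf.placeholder(tf.float32, [nbatch, nact])    # stand-in for train_model.pi
RECORDED_A = tf.placeholder(tf.int32, [nbatch])        # actions stored in the replay buffer

y_donor = tf.one_hot(RECORDED_A, nact)                 # recorded label
y_acceptor = tf.nn.softmax(logits)                     # predicted label

re_loss_l2 = tf.nn.l2_loss(y_acceptor - y_donor) / nbatch
re_loss_distill = tf.reduce_mean(
    -tf.reduce_sum(tf.stop_gradient(y_donor) * tf.log(y_acceptor + 1e-8), axis=1))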
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs*nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
def __init__(self, optim, policy, ob_dim, ac_dim, num_procs, max_grad_norm=0.5, lr=7e-4, vf_lr=0.001, cv_lr=0.001, cv_num=25, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', logdir=None): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) A = tf.placeholder(tf.float32, [None, ac_dim], name="A") ADV = tf.placeholder(tf.float32, [None], name="ADV") R = tf.placeholder(tf.float32, [None], name="R") train_model = policy(sess, ob_dim, ac_dim, vf_lr, cv_lr, reuse=False) step_model = policy(sess, ob_dim, ac_dim, vf_lr, cv_lr, reuse=True) params = find_trainable_variables("model") tf.summary.histogram("vf", train_model.vf) pi_params = [v for v in params if "pi" in v.name] vf_params = [v for v in params if "vf" in v.name] logpac = train_model.logprob_n vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) pg_loss = -tf.reduce_mean(ADV * logpac) tf.summary.scalar("vf_loss", vf_loss) if train_model.relaxed: ddiff_loss = tf.reduce_mean(train_model.cv) ddiff_grads_mean = tf.gradients(ddiff_loss, pi_params) ddiff_grads_std = tf.gradients(ddiff_loss, train_model.logstd_1a) dlogp_dmean = (A - train_model.mean) / tf.square( train_model.std_na) dlogp_dstd = -1 / train_model.std_na + 1 / tf.pow( train_model.std_na, 3) * tf.square(A - train_model.mean) pi_grads_mean = -((tf.expand_dims(ADV, 1) - train_model.cv) * dlogp_dmean) / tf.to_float(tf.shape(ADV)[0]) pg_grads_mean = tf.gradients(train_model.mean, pi_params, grad_ys=pi_grads_mean) pg_grads_mean = [ pg - dg for pg, dg in zip(pg_grads_mean, ddiff_grads_mean) ] pi_grads_std = -((tf.expand_dims(ADV, 1) - train_model.cv) * dlogp_dstd) / tf.to_float(tf.shape(ADV)[0]) pg_grads_std = tf.gradients(train_model.std_na, train_model.logstd_1a, grad_ys=pi_grads_std) pg_grads_std = [ pg - dg for pg, dg in zip(pg_grads_std, ddiff_grads_std) ] pg_grads = pg_grads_mean + pg_grads_std cv_loss = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0) cv_loss = tf.squeeze(tf.reduce_sum(tf.square(cv_loss))) tf.summary.scalar("cv_loss", cv_loss) cv_params = [v for v in params if "cv" in v.name] cv_grads = tf.gradients(cv_loss, cv_params) cv_gradvars = list(zip(cv_grads, cv_params)) else: pg_grads = tf.gradients(pg_loss, pi_params) + tf.gradients( pg_loss, train_model.logstd_1a) all_policy_grads = tf.concat([tf.reshape(pg, [-1]) for pg in pg_grads], 0) # policy gradients policy_gradvars = list( zip(pg_grads, pi_params + [train_model.logstd_1a])) vf_grads = tf.gradients(vf_loss, vf_params) vf_gradvars = list(zip(vf_grads, vf_params)) grads_list = policy_gradvars + vf_gradvars if train_model.relaxed: grads_list += cv_gradvars for g, v in grads_list: tf.summary.histogram(v.name, v) tf.summary.histogram(v.name + "_grad", g) sum_op = tf.summary.merge_all() writer = tf.summary.FileWriter(logdir) trainer = optim _train = trainer.apply_gradients(policy_gradvars) _vf_train = train_model.vf_optim.apply_gradients(vf_gradvars) self._step = 0 def get_cv_grads(obs, old_actions, advs, rewards, vf_in, values): advs = rewards - values td_map = { train_model.ob_no: obs, train_model.oldac_na: old_actions, train_model.X: vf_in, A: old_actions, ADV: advs, R: rewards } cv_gs = sess.run(cv_grads, td_map) return cv_gs def update_cv(mean_cv_gs): cv_gvs = list(zip(mean_cv_gs, cv_params)) train_model.cv_optim.apply_gradients(cv_gvs) def update_policy_and_value(obs, old_actions, advs, rewards, vf_in, values, summary=False): 
advs = rewards - values td_map = { train_model.ob_no: obs, train_model.oldac_na: old_actions, train_model.X: vf_in, A: old_actions, ADV: advs, R: rewards } for _ in range(25): sess.run( _vf_train, { train_model.ob_no: obs, train_model.oldac_na: old_actions, train_model.X: vf_in, A: old_actions, ADV: advs, R: rewards }) if summary: sum_str, policy_loss, value_loss, _, = sess.run( [sum_op, pg_loss, vf_loss, _train], td_map) writer.add_summary(sum_str, self._step) else: policy_loss, value_loss, _ = sess.run( [pg_loss, vf_loss, _train], td_map) self._step += 1 return policy_loss, value_loss def get_grads(obs, old_actions, advs, rewards, vf_in, value): advs = rewards - value td_map = { train_model.ob_no: obs, train_model.oldac_na: old_actions, train_model.X: vf_in, A: old_actions, ADV: advs, R: rewards } _g = all_policy_grads # seems to already have happened? / tf.to_float(tf.shape(rewards)[0]) pg = sess.run(_g, td_map) return pg def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.sess = sess self.get_cv_grads = get_cv_grads self.update_cv = update_cv self.update_policy_and_value = update_policy_and_value self.train_model = train_model self.step_model = step_model self.value = train_model.value self.get_grads = get_grads self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
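The relaxed branch above plugs in analytic derivatives of the Gaussian log-density with respect to its mean and standard deviation. A quick finite-difference check of those two expressions with toy scalars:

import numpy as np

def log_normal(a, mean, std):
    return -0.5 * np.log(2 * np.pi * std**2) - (a - mean)**2 / (2 * std**2)

a, mean, std, h = 0.7, 0.2, 1.3, 1e-6
dmean_analytic = (a - mean) / std**2                      # matches dlogp_dmean above
dstd_analytic = -1.0 / std + (a - mean)**2 / std**3       # matches dlogp_dstd above

dmean_fd = (log_normal(a, mean + h, std) - log_normal(a, mean - h, std)) / (2 * h)
dstd_fd = (log_normal(a, mean, std + h) - log_normal(a, mean, std - h)) / (2 * h)
assert abs(dmean_analytic - dmean_fd) < 1e-5 and abs(dstd_analytic - dstd_fd) < 1e-5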
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(20e6), lrschedule='linear'): sess = tf.get_default_session() nbatch = nenvs * nsteps step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True) A = train_model.pdtype.sample_placeholder([nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(train_model.pd.entropy()) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, discounted_rewards, rewards, masks, prev_actions, actions, values, dones): advs = discounted_rewards - values for step in range(len(obs)): cur_lr = lr.value() # reshape actions, rewards, and dones to have first dimension of size nenvs*nsteps, existing second dimension # this is already done for obs rews = np.reshape(rewards, (nbatch, 1)) ds = np.reshape(np.asarray(dones, dtype=np.float32), (nbatch, 1)) if len(ac_space.shape) == 0: prev_actions = np.reshape(prev_actions, (nbatch, )) one_hot = np.eye(ac_space.n)[prev_actions] for i in range(nbatch): if prev_actions[i] == -1: one_hot[i, :] = np.zeros((ac_space.n, ), dtype=np.int) x = np.concatenate((obs, one_hot, rews, ds), axis=1) actions = np.reshape(actions, (nbatch, )) else: prev_actions = np.reshape(prev_actions, (nbatch, ac_space.shape[0])) x = np.concatenate((obs, prev_actions, rews, ds), axis=1) td_map = { train_model.X: x, A: actions, ADV: advs, R: discounted_rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
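The discrete-action branch above one-hot encodes the previous action and zeroes the row when the previous action is -1 (start of an episode) before concatenating it onto the observation. The same encoding in a few lines of numpy (toy action count):

import numpy as np

nact = 4
prev_actions = np.array([2, -1, 0])

one_hot = np.eye(nact)[prev_actions]        # note: index -1 silently picks the last row...
one_hot[prev_actions == -1] = 0.0           # ...so episode starts must be zeroed out explicitly
print(one_hot)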