def a2c_loss(pi, vf):
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=train_model.pi, labels=A)
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    # ent_coef_mode = hparams.get('ent_coef_mode', 'default')
    # ent_coef_val = hparams.get('ent_coef_val', ent_coef)
    # if ent_coef_mode == 'default':
    #     actual_ent_coef = ent_coef_val
    # elif ent_coef_mode == 'linear_teacher':
    #     actual_ent_coef = ent_coef_val * TEACHER_C + ent_coef * (1 - TEACHER_C)
    # elif ent_coef_mode == 'additive_teacher':
    #     actual_ent_coef = ent_coef_val + ent_coef_val * TEACHER_C
    # else:
    #     raise Exception('unrecognized ent_coef_mode: {}'.format(ent_coef_mode))
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
    return loss, pg_loss, vf_loss, entropy
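# The losses above lean on two small helpers, `mse` and `cat_entropy`, that are not shown
# in this file. A minimal sketch of equivalent implementations, in the style of the OpenAI
# baselines a2c utilities this code appears to follow, is given below for reference; treat
# the exact forms (including the 1/2 factor in mse) as an assumption.

def mse(pred, target):
    # Elementwise squared error; callers take the mean themselves.
    return tf.square(pred - target) / 2.


def cat_entropy(logits):
    # Entropy of a categorical distribution given unnormalized logits,
    # computed in a numerically stable way.
    a0 = logits - tf.reduce_max(logits, 1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, 1, keepdims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)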
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef, q_coef, e_vf_coef, gamma, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region, alpha, delta): config = tf.ConfigProto( # allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) # actions D = tf.placeholder(tf.float32, [nbatch]) # dones R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's LR = tf.placeholder(tf.float32, []) eps = 1e-6 step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) # for explore start ================================= e_ADV = tf.placeholder(tf.float32, [nbatch]) e_R = tf.placeholder(tf.float32, [nbatch]) e_pi_logits, e_v = map(lambda var: strip(var, nenvs, nsteps), [train_model.e_pi_logits, train_model.e_v]) e_neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=e_pi_logits, labels=A) e_pg_loss = tf.reduce_mean(e_ADV * e_neglogpac) e_vf_loss = tf.reduce_mean(mse(tf.squeeze(e_v), e_R)) # entropy = tf.reduce_mean(cat_entropy(train_model.pi)) e_loss = e_pg_loss + e_vf_loss * e_vf_coef # e_params = find_trainable_variables("model/explore") with tf.variable_scope('model'): e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/explore') e_grads = tf.gradients(e_loss, e_params) if max_grad_norm is not None: e_grads, e_grad_norm = tf.clip_by_global_norm( e_grads, max_grad_norm) # for explore end ================================= # params = find_trainable_variables("model/acer") with tf.variable_scope('model'): params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/acer') print("Params {}".format(len(params))) for var in params: print(var) # create polyak averaged model ema = tf.train.ExponentialMovingAverage(alpha) ema_apply_op = ema.apply(params) def custom_getter(getter, *args, **kwargs): v0 = getter(*args, **kwargs) v = ema.average(v0) # v = ema.average(getter(*args, **kwargs)) if v is None: return v0 else: print(v.name) return v with tf.variable_scope("", custom_getter=custom_getter, reuse=True): polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True) # Notation: (var) = batch variable, (var)s = seqeuence variable, (var)_i = variable index by action at step i v = tf.reduce_sum(train_model.pi * train_model.q, axis=-1) # shape is [nenvs * (nsteps + 1)] # strip off last step f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q]) # Get pi and q values for actions taken f_i = get_by_index(f, A) q_i = get_by_index(q, A) # Compute ratios for importance truncation rho = f / (MU + eps) rho_i = get_by_index(rho, A) # Calculate Q_retrace targets qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma) # Calculate losses # Entropy entropy = tf.reduce_mean(cat_entropy_softmax(f)) # Policy Graident loss, with truncated importance sampling & bias correction v = strip(v, nenvs, nsteps, True) check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4) check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2) # Truncated importance sampling adv = qret - v logf = tf.log(f_i + eps) gain_f = logf * tf.stop_gradient( adv * tf.minimum(c, rho_i)) # [nenvs * nsteps] 
loss_f = -tf.reduce_mean(gain_f) # Bias correction for the truncation adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]) ) # [nenvs * nsteps, nact] logf_bc = tf.log(f + eps) # / (f_old + eps) check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2) gain_bc = tf.reduce_sum( logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1) #IMP: This is sum, as expectation wrt f loss_bc = -tf.reduce_mean(gain_bc) loss_policy = loss_f + loss_bc # Value/Q function loss, and explained variance check_shape([qret, q_i], [[nenvs * nsteps]] * 2) ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps])) loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5) # Net loss check_shape([loss_policy, loss_q, entropy], [[]] * 3) # loss = loss_policy + q_coef * loss_q - ent_coef * entropy loss = loss_policy + q_coef * loss_q + e_loss if trust_region: g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact] # k = tf.gradients(KL(f_pol || f), f) k = -f_pol / ( f + eps ) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f k_dot_g = tf.reduce_sum(k * g, axis=-1) adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps] # Calculate stats (before doing adjustment) for logging. avg_norm_k = avg_norm(k) avg_norm_g = avg_norm(g) avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g)) avg_norm_adj = tf.reduce_mean(tf.abs(adj)) g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k grads_f = -g / ( nenvs * nsteps ) # These are turst region adjusted gradients wrt f ie statistics of policy pi grads_policy = tf.gradients(f, params, grads_f) grads_q = tf.gradients(loss_q * q_coef, params) grads = [ gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params) ] avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs) norm_grads_q = tf.global_norm(grads_q) norm_grads_policy = tf.global_norm(grads_policy) else: grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm) # add explore grads grads.extend(e_grads) params.extend(e_params) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _opt_op = trainer.apply_gradients(grads) # so when you call _train, you first do the gradient step, then you apply ema with tf.control_dependencies([_opt_op]): _train = tf.group(ema_apply_op) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) # Ops/Summaries to run, and their names for logging run_ops = [ _train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads ] names_ops = [ 'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance', 'norm_grads' ] if trust_region: run_ops = run_ops + [ norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g, avg_norm_adj, e_pg_loss, e_vf_loss ] names_ops = names_ops + [ 'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj', 'e_pg_loss', 'e_vf_loss' ] def train(obs, actions, rewards, dones, mus, states, masks, steps, e_returns, e_advs): cur_lr = lr.value_steps(steps) td_map = { train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr, e_R: e_returns, e_ADV: e_advs } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks 
td_map[polyak_model.S] = states td_map[polyak_model.M] = masks return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) self.train = train self.save = save self.train_model = train_model self.step_model = step_model self.step = step_model.step self.e_step = step_model.e_step self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
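# The ACER model above builds its targets with `q_retrace`, which is not shown in this file.
# Below is a sketch of the usual implementation in this lineage of code, following the
# backward recursion Q_ret_t = r_t + gamma * (1 - d_t) * Q_ret_{t+1}, folded through the
# truncated importance weight rho_bar and the state value V as in the ACER paper.
# `batch_to_seq`, `seq_to_batch`, and `check_shape` are assumed to come from the same
# utilities as the rest of this code.

def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
    rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True)  # truncated IS weights
    rs = batch_to_seq(R, nenvs, nsteps, True)      # rewards
    ds = batch_to_seq(D, nenvs, nsteps, True)      # dones
    q_is = batch_to_seq(q_i, nenvs, nsteps, True)  # Q(s_t, a_t)
    vs = batch_to_seq(v, nenvs, nsteps + 1, True)  # V(s_t), includes the bootstrap state
    qret = vs[-1]
    qrets = []
    for i in range(nsteps - 1, -1, -1):
        check_shape([qret, ds[i], rs[i], rho_bar[i], q_is[i], vs[i]], [[nenvs]] * 6)
        qret = rs[i] + gamma * qret * (1.0 - ds[i])
        qrets.append(qret)
        qret = (rho_bar[i] * (qret - q_is[i])) + vs[i]
    qrets = qrets[::-1]
    return seq_to_batch(qrets, flat=True)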
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', param=None): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False, param=param) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True, param=param) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
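# Every model in this file drives its learning rate through `Scheduler(...)`; the class
# itself is not shown. A sketch of what it typically looks like in this lineage of code
# follows (only 'linear' and 'constant' schedules are sketched; the stable-baselines-style
# models below call the same idea with initial_value/n_values keyword names instead).

def constant(p):
    return 1


def linear(p):
    return 1 - p


schedules = {'constant': constant, 'linear': linear}


class Scheduler(object):
    def __init__(self, v, nvalues, schedule):
        self.n = 0.
        self.v = v
        self.nvalues = nvalues
        self.schedule = schedules[schedule]

    def value(self):
        # One call per training step: anneal according to the fraction of steps used so far.
        current_value = self.v * self.schedule(self.n / self.nvalues)
        self.n += 1.
        return current_value

    def value_steps(self, steps):
        # Used by the ACER model above, which tracks the global step count itself.
        return self.v * self.schedule(steps / self.nvalues)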
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, v_mix_coef=0.5, max_grad_norm=0.5, lr_alpha=7e-4, lr_beta=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', r_ex_coef=1.0, r_in_coef=0.0, v_ex_coef=1.0): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch], 'A') R_EX = tf.placeholder(tf.float32, [nbatch], 'R_EX') ADV_EX = tf.placeholder(tf.float32, [nbatch], 'ADV_EX') RET_EX = tf.placeholder(tf.float32, [nbatch], 'RET_EX') V_MIX = tf.placeholder(tf.float32, [nbatch], 'V_MIX') DIS_V_MIX_LAST = tf.placeholder(tf.float32, [nbatch], 'DIS_V_MIX_LAST') COEF_MAT = tf.placeholder(tf.float32, [nbatch, nbatch], 'COEF_MAT') LR_ALPHA = tf.placeholder(tf.float32, [], 'LR_ALPHA') LR_BETA = tf.placeholder(tf.float32, [], 'LR_BETA') step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True) r_mix = r_ex_coef * R_EX + r_in_coef * tf.reduce_sum( train_model.r_in * tf.one_hot(A, nact), axis=1) ret_mix = tf.squeeze( tf.matmul(COEF_MAT, tf.reshape(r_mix, [nbatch, 1])), [1]) + DIS_V_MIX_LAST adv_mix = ret_mix - V_MIX neglogpac = train_model.pd.neglogp(A) pg_mix_loss = tf.reduce_mean(adv_mix * neglogpac) v_mix_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_mix), ret_mix)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) policy_loss = pg_mix_loss - ent_coef * entropy + v_mix_coef * v_mix_loss policy_params = train_model.policy_params policy_grads = tf.gradients(policy_loss, policy_params) if max_grad_norm is not None: policy_grads, policy_grad_norm = tf.clip_by_global_norm( policy_grads, max_grad_norm) policy_grads_and_vars = list(zip(policy_grads, policy_params)) policy_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_ALPHA, decay=alpha, epsilon=epsilon) policy_train = policy_trainer.apply_gradients(policy_grads_and_vars) rmss = [policy_trainer.get_slot(var, 'rms') for var in policy_params] policy_params_new = {} for grad, rms, var in zip(policy_grads, rmss, policy_params): ms = rms + (tf.square(grad) - rms) * (1 - alpha) policy_params_new[ var.name] = var - LR_ALPHA * grad / tf.sqrt(ms + epsilon) policy_new = train_model.policy_new_fn(policy_params_new, ob_space, ac_space, nbatch, nsteps) neglogpac_new = policy_new.pd.neglogp(A) ratio_new = tf.exp(tf.stop_gradient(neglogpac) - neglogpac_new) pg_ex_loss = tf.reduce_mean(-ADV_EX * ratio_new) v_ex_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_ex), RET_EX)) intrinsic_loss = pg_ex_loss + v_ex_coef * v_ex_loss intrinsic_params = train_model.intrinsic_params intrinsic_grads = tf.gradients(intrinsic_loss, intrinsic_params) if max_grad_norm is not None: intrinsic_grads, intrinsic_grad_norm = tf.clip_by_global_norm( intrinsic_grads, max_grad_norm) intrinsic_grads_and_vars = list(zip(intrinsic_grads, intrinsic_params)) intrinsic_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_BETA, decay=alpha, epsilon=epsilon) intrinsic_train = intrinsic_trainer.apply_gradients( intrinsic_grads_and_vars) lr_alpha = Scheduler(v=lr_alpha, nvalues=total_timesteps, schedule=lrschedule) lr_beta = Scheduler(v=lr_beta, nvalues=total_timesteps, schedule=lrschedule) all_params = tf.global_variables() def train(obs, policy_states, masks, actions, r_ex, ret_ex, v_ex, v_mix, dis_v_mix_last, coef_mat): advs_ex = ret_ex - v_ex for step in range(len(obs)): cur_lr_alpha = lr_alpha.value() cur_lr_beta = lr_beta.value() td_map = { train_model.X: obs, policy_new.X: obs, A: 
actions, R_EX: r_ex, ADV_EX: advs_ex, RET_EX: ret_ex, V_MIX: v_mix, DIS_V_MIX_LAST: dis_v_mix_last, COEF_MAT: coef_mat, LR_ALPHA: cur_lr_alpha, LR_BETA: cur_lr_beta } if policy_states is not None: td_map[train_model.PS] = policy_states td_map[train_model.M] = masks return sess.run([entropy, policy_train, intrinsic_train], td_map)[0] def save(save_path): ps = sess.run(all_params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(all_params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.intrinsic_reward = step_model.intrinsic_reward self.init_policy_state = step_model.init_policy_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
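# In the intrinsic-reward model above, the mixed return is assembled in-graph as
# ret_mix = COEF_MAT @ r_mix + DIS_V_MIX_LAST, so the runner has to supply a discounting
# matrix and a bootstrap vector. The sketch below shows one self-consistent way to build
# them, assuming the batch is flattened env-major (all nsteps of env 0, then env 1, ...)
# and that `v_mix_last` holds V_mix of the state following each env's rollout; the function
# name and the layout are assumptions, not part of the code above.
import numpy as np


def make_mix_return_inputs(dones, v_mix_last, gamma, nenvs, nsteps):
    nbatch = nenvs * nsteps
    coef_mat = np.zeros((nbatch, nbatch), dtype=np.float32)
    dis_v_mix_last = np.zeros(nbatch, dtype=np.float32)
    for i in range(nbatch):
        env = i // nsteps
        # Bootstrap with the value after the rollout, discounted back to step i.
        dis_v_mix_last[i] = gamma ** (nsteps - i % nsteps) * v_mix_last[env]
        coef = 1.0
        for j in range(i, (env + 1) * nsteps):  # stay inside this env's segment
            coef_mat[i, j] = coef
            coef *= gamma
            if dones[j]:
                dis_v_mix_last[i] = 0.0  # episode ended: drop the bootstrap term
                break
    return coef_mat, dis_v_mix_last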
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', logdir=None): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs*nsteps ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=train_model.a0) entropy = tf.reduce_sum(cat_entropy(train_model.pi)) params = find_trainable_variables("model") tf.summary.histogram("vf", train_model.vf) tf.summary.histogram("R", R) if train_model.relaxed: pg_loss = tf.constant(0.0) oh_A = tf.one_hot(train_model.a0, ac_space.n) params = find_trainable_variables("model") policy_params = [v for v in params if "pi" in v.name] vf_params = [v for v in params if "vf" in v.name] entropy_grads = tf.gradients(entropy, policy_params) ddiff_loss = tf.reduce_sum(train_model.vf - train_model.vf_t) ddiff_grads = tf.gradients(ddiff_loss, policy_params) sm = tf.nn.softmax(train_model.pi) dlogp_dpi = oh_A * (1. - sm) + (1. - oh_A) * (-sm) pi_grads = -((tf.expand_dims(R, 1) - train_model.vf_t) * dlogp_dpi) pg_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads) pg_grads = [pg - dg for pg, dg in zip(pg_grads, ddiff_grads)] pi_param_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads) cv_grads = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0) cv_grad_splits = tf.reduce_sum(tf.square(cv_grads)) vf_loss = cv_grad_splits * vf_coef cv_grads = tf.gradients(vf_loss, vf_params) policy_grads = [] for e_grad, p_grad, param in zip(entropy_grads, pg_grads, policy_params): grad = -e_grad * ent_coef + p_grad policy_grads.append(grad) grad_dict = {} for g, v in list(zip(policy_grads, policy_params))+list(zip(cv_grads, vf_params)): grad_dict[v] = g grads = [grad_dict[v] for v in params] print(grads) else: pg_loss = tf.reduce_sum((tf.stop_gradient(R) - tf.stop_gradient(train_model.vf)) * neglogpac) policy_params = [v for v in params if "pi" in v.name] pg_grads = tf.gradients(pg_loss, policy_params) vf_loss = tf.reduce_sum(mse(tf.squeeze(train_model.vf), R)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef grads = tf.gradients(loss, params) grads = list(zip(grads, params)) ema = tf.train.ExponentialMovingAverage(.99) all_policy_grads = tf.concat([tf.reshape(g, [-1]) for g in pg_grads], 0) all_policy_grads_sq = tf.square(all_policy_grads) apply_mean_op = ema.apply([all_policy_grads, all_policy_grads_sq]) em_mean = ema.average(all_policy_grads) em_mean_sq = ema.average(all_policy_grads_sq) em_var = em_mean_sq - tf.square(em_mean) em_log_var = tf.log(em_var + 1e-20) mlgv = tf.reduce_mean(em_log_var) for g, v in grads: print(v.name, g) tf.summary.histogram(v.name, v) tf.summary.histogram(v.name+"_grad", g) self.sum_op = tf.summary.merge_all() self.writer = tf.summary.FileWriter(logdir) trainer = tf.train.AdamOptimizer(learning_rate=LR, beta2=.99999) with tf.control_dependencies([apply_mean_op]): _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) self._step = 0 
def train(obs, states, rewards, masks, u1, u2, values, summary=False): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X:obs, train_model.U1:u1, train_model.U2:u2, ADV:advs, R:rewards, LR:cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks if summary: sum_str, policy_loss, value_loss, policy_entropy, lv, _ = sess.run( [self.sum_op, pg_loss, vf_loss, entropy, mlgv, _train], td_map ) self.writer.add_summary(sum_str, self._step) else: policy_loss, value_loss, policy_entropy, lv, _ = sess.run( [pg_loss, vf_loss, entropy, mlgv, _train], td_map ) self._step += 1 return policy_loss, value_loss, policy_entropy, lv def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, optim, policy, ob_dim, ac_dim, num_procs, max_grad_norm=0.5, lr=7e-4, vf_lr=0.001, cv_lr=0.001, cv_num=25, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', logdir=None): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) A = tf.placeholder(tf.float32, [None, ac_dim], name="A") ADV = tf.placeholder(tf.float32, [None], name="ADV") R = tf.placeholder(tf.float32, [None], name="R") train_model = policy(sess, ob_dim, ac_dim, vf_lr, cv_lr, reuse=False) step_model = policy(sess, ob_dim, ac_dim, vf_lr, cv_lr, reuse=True) params = find_trainable_variables("model") tf.summary.histogram("vf", train_model.vf) pi_params = [v for v in params if "pi" in v.name] vf_params = [v for v in params if "vf" in v.name] logpac = train_model.logprob_n vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) pg_loss = -tf.reduce_mean(ADV * logpac) tf.summary.scalar("vf_loss", vf_loss) if train_model.relaxed: ddiff_loss = tf.reduce_mean(train_model.cv) ddiff_grads_mean = tf.gradients(ddiff_loss, pi_params) ddiff_grads_std = tf.gradients(ddiff_loss, train_model.logstd_1a) dlogp_dmean = (A - train_model.mean) / tf.square( train_model.std_na) dlogp_dstd = -1 / train_model.std_na + 1 / tf.pow( train_model.std_na, 3) * tf.square(A - train_model.mean) pi_grads_mean = -((tf.expand_dims(ADV, 1) - train_model.cv) * dlogp_dmean) / tf.to_float(tf.shape(ADV)[0]) pg_grads_mean = tf.gradients(train_model.mean, pi_params, grad_ys=pi_grads_mean) pg_grads_mean = [ pg - dg for pg, dg in zip(pg_grads_mean, ddiff_grads_mean) ] pi_grads_std = -((tf.expand_dims(ADV, 1) - train_model.cv) * dlogp_dstd) / tf.to_float(tf.shape(ADV)[0]) pg_grads_std = tf.gradients(train_model.std_na, train_model.logstd_1a, grad_ys=pi_grads_std) pg_grads_std = [ pg - dg for pg, dg in zip(pg_grads_std, ddiff_grads_std) ] pg_grads = pg_grads_mean + pg_grads_std cv_loss = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0) cv_loss = tf.squeeze(tf.reduce_sum(tf.square(cv_loss))) tf.summary.scalar("cv_loss", cv_loss) cv_params = [v for v in params if "cv" in v.name] cv_grads = tf.gradients(cv_loss, cv_params) cv_gradvars = list(zip(cv_grads, cv_params)) else: pg_grads = tf.gradients(pg_loss, pi_params) + tf.gradients( pg_loss, train_model.logstd_1a) all_policy_grads = tf.concat([tf.reshape(pg, [-1]) for pg in pg_grads], 0) # policy gradients policy_gradvars = list( zip(pg_grads, pi_params + [train_model.logstd_1a])) vf_grads = tf.gradients(vf_loss, vf_params) vf_gradvars = list(zip(vf_grads, vf_params)) grads_list = policy_gradvars + vf_gradvars if train_model.relaxed: grads_list += cv_gradvars for g, v in grads_list: tf.summary.histogram(v.name, v) tf.summary.histogram(v.name + "_grad", g) sum_op = tf.summary.merge_all() writer = tf.summary.FileWriter(logdir) trainer = optim _train = trainer.apply_gradients(policy_gradvars) _vf_train = train_model.vf_optim.apply_gradients(vf_gradvars) self._step = 0 def get_cv_grads(obs, old_actions, advs, rewards, vf_in, values): advs = rewards - values td_map = { train_model.ob_no: obs, train_model.oldac_na: old_actions, train_model.X: vf_in, A: old_actions, ADV: advs, R: rewards } cv_gs = sess.run(cv_grads, td_map) return cv_gs def update_cv(mean_cv_gs): cv_gvs = list(zip(mean_cv_gs, cv_params)) train_model.cv_optim.apply_gradients(cv_gvs) def update_policy_and_value(obs, old_actions, advs, rewards, vf_in, values, summary=False): 
advs = rewards - values td_map = { train_model.ob_no: obs, train_model.oldac_na: old_actions, train_model.X: vf_in, A: old_actions, ADV: advs, R: rewards } for _ in range(25): sess.run( _vf_train, { train_model.ob_no: obs, train_model.oldac_na: old_actions, train_model.X: vf_in, A: old_actions, ADV: advs, R: rewards }) if summary: sum_str, policy_loss, value_loss, _, = sess.run( [sum_op, pg_loss, vf_loss, _train], td_map) writer.add_summary(sum_str, self._step) else: policy_loss, value_loss, _ = sess.run( [pg_loss, vf_loss, _train], td_map) self._step += 1 return policy_loss, value_loss def get_grads(obs, old_actions, advs, rewards, vf_in, value): advs = rewards - value td_map = { train_model.ob_no: obs, train_model.oldac_na: old_actions, train_model.X: vf_in, A: old_actions, ADV: advs, R: rewards } _g = all_policy_grads # seems to already have happened? / tf.to_float(tf.shape(rewards)[0]) pg = sess.run(_g, td_map) return pg def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.sess = sess self.get_cv_grads = get_cv_grads self.update_cv = update_cv self.update_policy_and_value = update_policy_and_value self.train_model = train_model self.step_model = step_model self.value = train_model.value self.get_grads = get_grads self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
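# Hypothetical composition sketch for the control-variate model above (the rollout buffer
# layout and the update schedule are assumptions): control-variate gradients are gathered
# with get_cv_grads, averaged per variable, applied via update_cv, and only then is the
# policy/value update run.
import numpy as np


def cv_update_step(model, rollouts):
    # rollouts: list of (obs, old_actions, advs, rewards, vf_in, values) tuples
    cv_gs = [model.get_cv_grads(*rollout) for rollout in rollouts]
    mean_cv_gs = [np.mean(per_var, axis=0) for per_var in zip(*cv_gs)]  # average per variable
    model.update_cv(mean_cv_gs)
    # Update the policy and value function on the most recent rollout.
    return model.update_policy_and_value(*rollouts[-1])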
def __init__(self, policy, ob_space, ac_space, n_envs, total_timesteps, nprocs=32, n_steps=20, ent_coef=0.01, vf_coef=0.25, vf_fisher_coef=1.0, learning_rate=0.25, max_grad_norm=0.5, kfac_clip=0.001, lr_schedule='linear'): """ The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144 :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) :param ob_space: (Gym Space) The observation space :param ac_space: (Gym Space) The action space :param n_envs: (int) The number of environments :param total_timesteps: (int) The total number of timesteps for training the model :param nprocs: (int) The number of threads for TensorFlow operations :param n_steps: (int) The number of steps to run for each environment :param ent_coef: (float) The weight for the entropic loss :param vf_coef: (float) The weight for the loss on the value function :param vf_fisher_coef: (float) The weight for the fisher loss on the value function :param learning_rate: (float) The initial learning rate for the RMS prop optimizer :param max_grad_norm: (float) The clipping value for the maximum gradient :param kfac_clip: (float) gradient clipping for Kullback leiber :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop') """ config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) n_batch = n_envs * n_steps action_ph = tf.placeholder(tf.int32, [n_batch]) advs_ph = tf.placeholder(tf.float32, [n_batch]) rewards_ph = tf.placeholder(tf.float32, [n_batch]) pg_lr_ph = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, n_envs, 1, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, n_envs * n_steps, n_steps, reuse=True) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.policy, labels=action_ph) self.logits = train_model.policy # training loss pg_loss = tf.reduce_mean(advs_ph * logpac) entropy = tf.reduce_mean(calc_entropy(train_model.policy)) pg_loss = pg_loss - ent_coef * entropy vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph) train_loss = pg_loss + vf_coef * vf_loss # Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.value_fn + tf.random_normal( tf.shape(train_model.value_fn)) self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean( tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2)) self.joint_fisher = pg_fisher_loss + vf_fisher_loss self.params = params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer( learning_rate=pg_lr_ph, clip_kl=kfac_clip, momentum=0.9, kfac_update=1, epsilon=0.01, stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) optim.compute_and_apply_stats(self.joint_fisher, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner self.learning_rate = Scheduler(initial_value=learning_rate, n_values=total_timesteps, schedule=lr_schedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for _ in range(len(obs)): cur_lr = self.learning_rate.value() td_map = { train_model.obs_ph: obs, action_ph: actions, advs_ph: advs, 
rewards_ph: rewards, pg_lr_ph: cur_lr } if states is not None: td_map[train_model.states_ph] = states td_map[train_model.masks_ph] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): session_params = sess.run(params) joblib.dump(session_params, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for param, loaded_p in zip(params, loaded_params): restores.append(param.assign(loaded_p)) sess.run(restores) self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
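# The KFAC optimizer above hands back a queue runner (stored as self.q_runner) that feeds
# its asynchronous statistics updates; it is typically started once in the outer training
# loop before train() is called repeatedly. A sketch of that usual pattern (the `model`
# variable name is illustrative):
coord = tf.train.Coordinator()
if model.q_runner is not None:
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
else:
    enqueue_threads = []
# ... run the training loop, calling model.train(...) each update ...
coord.request_stop()
coord.join(enqueue_threads)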
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, lambda_dist=0.01, total_timesteps=None, lrschedule='linear'): sess = tf.get_default_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) config = Config() act_model = policy(config) config.reuse = True train_model = policy(config) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.logits, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.logits)) aux_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.rp_logits, labels=A) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + aux_loss * lambda_dist params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) saver = tf.train.Saver() def train(obs, rs, rr, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr, train_model.inputs_s: rs, train_model.inputs_r: rr } policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): saver.save(sess, save_path + 'model.ckpt') def load(load_path): saver.restore(sess, load_path + 'model.ckpt') self.train = train self.train_model = train_model self.act_model = act_model self.act = act_model.act self.value = act_model.value self.save = save self.load = load
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', nModelsToKeep=5): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(): modelfile = os.path.join( logger.get_dir(), datetime.datetime.now().strftime("model-%Y-%m-%d-%H-%M-%S-%f")) ps = sess.run(params) joblib.dump(ps, modelfile) logger.log('Model saved to %s' % modelfile) model_files = sorted( fnmatch.filter(os.listdir(logger.get_dir()), "model-*")) if len(model_files) > nModelsToKeep: for old_file in model_files[0:-nModelsToKeep]: os.remove(os.path.join(logger.get_dir(), old_file)) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) logger.log('Model loaded from %s' % load_path) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, n_envs, n_steps, ent_coef=0.01, vf_coef=0.25, max_grad_norm=0.5, learning_rate=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lr_schedule='linear'): """ The A2C (Advantage Actor Critic) model class, https://arxiv.org/abs/1602.01783 :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) :param ob_space: (Gym Space) Observation space :param ac_space: (Gym Space) Action space :param n_envs: (int) The number of environments :param n_steps: (int) The number of steps to run for each environment :param ent_coef: (float) Entropy coefficient for the loss caculation :param vf_coef: (float) Value function coefficient for the loss calculation :param max_grad_norm: (float) The maximum value for the gradient clipping :param learning_rate: (float) The learning rate :param alpha: (float) RMS prop optimizer decay :param epsilon: (float) RMS prop optimizer epsilon :param total_timesteps: (int) The total number of samples :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop') """ sess = tf_util.make_session() n_batch = n_envs * n_steps actions_ph = tf.placeholder(tf.int32, [n_batch]) advs_ph = tf.placeholder(tf.float32, [n_batch]) rewards_ph = tf.placeholder(tf.float32, [n_batch]) learning_rate_ph = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, n_envs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, n_envs * n_steps, n_steps, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.policy, labels=actions_ph) pg_loss = tf.reduce_mean(advs_ph * neglogpac) vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph) entropy = tf.reduce_mean(calc_entropy(train_model.policy)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=learning_rate_ph, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) learning_rate = Scheduler(initial_value=learning_rate, n_values=total_timesteps, schedule=lr_schedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for _ in range(len(obs)): cur_lr = learning_rate.value() td_map = { train_model.obs_ph: obs, actions_ph: actions, advs_ph: advs, rewards_ph: rewards, learning_rate_ph: cur_lr } if states is not None: td_map[train_model.states_ph] = states td_map[train_model.masks_ph] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): parameters = sess.run(params) make_path(os.path.dirname(save_path)) joblib.dump(parameters, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for param, loaded_p in zip(params, loaded_params): restores.append(param.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
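# train() above regresses the value head directly onto `rewards`, so the runner is expected
# to pass bootstrapped n-step returns rather than raw rewards. A sketch of the usual
# discounting helper used on the runner side (baselines-style; treat the exact form as an
# assumption):

def discount_with_dones(rewards, dones, gamma):
    discounted = []
    ret = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        ret = reward + gamma * ret * (1. - done)  # reset the return at episode boundaries
        discounted.append(ret)
    return discounted[::-1]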
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) self.logits = logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV*logpac) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params=params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss,params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ momentum=0.9, kfac_update=1, epsilon=0.01,\ stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads,params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, action_space, nenvs, nsteps,
             ent_coeff, vf_coeff, max_grad_norm):
    sess = tf.get_default_session()

    # Define placeholders
    actions_ = tf.placeholder(tf.int32, [None], name="actions_")
    advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
    rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
    lr_ = tf.placeholder(tf.float32, name="learning_rate_")

    # Create our two models here
    # take one step for each environment
    step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False)
    # take number of steps * number of environments for total steps
    train_model = policy(sess, ob_space, action_space, nenvs * nsteps, nsteps, reuse=True)

    # Calculate the loss
    # Note: in the future we can add a clipped loss to control the step size of our
    # parameter updates. This can lead to better convergence (as in PPO).
    # Recall that Total Loss = PolicyGradientLoss - Entropy * EntropyCoeff + ValueLoss * ValueCoeff

    # output loss: -log(policy)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=train_model.pi, labels=actions_)

    # 1/n * sum(A(s,a) * -logpi(a|s))
    pg_loss = tf.reduce_mean(advantages_ * neglogpac)

    # value loss
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), rewards_))

    # entropy
    entropy = tf.reduce_mean(train_model.pd.entropy())

    # total loss
    loss = pg_loss - (entropy * ent_coeff) + (vf_loss * vf_coeff)

    # Update the parameters using the loss we've just calculated
    # Grab model params
    params = find_trainable_variables("model")

    # Calculate gradients. We'll want to zip our parameters with our gradients.
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))

    # build our trainer
    trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, decay=0.99, epsilon=1e-5)

    # Backprop
    _train = trainer.apply_gradients(grads)

    def train(states_in, actions, returns, values, lr):
        # here we calculate advantage A(s, a) = R + yV(s') - V(s)
        # Returns = R + yV(s')
        advantages = returns - values
        td_map = {
            train_model.inputs_: states_in,
            actions_: actions,
            advantages_: advantages,
            rewards_: returns,
            # Recall we bootstrap the "real" value since we learn one step at a time (not per episode).
            lr_: lr
        }
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        saver = tf.train.Saver()
        saver.save(sess, save_path)

    def load(load_path):
        saver = tf.train.Saver()
        saver.restore(sess, load_path)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', continuous_actions=False, debug=False, numAgents=2, itr=1, particleEnv=False, communication=False): self.continuous_actions = continuous_actions self.nenvs = nenvs print('vf_coef', vf_coef) config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) # print('action space: ', ac_space) if particleEnv == False: nact = ac_space.n elif communication == False: nact = ac_space[itr].n else: nact = ac_space[itr].high - ac_space[itr].low # modified self.nact = nact # print('nact: ', nact) # print(nact) nbatch = nenvs * nsteps # print(nbatch) # print('batch size: ', nbatch) if self.continuous_actions: A = tf.placeholder(tf.float32, [nbatch]) elif particleEnv == False or communication == False: A = tf.placeholder(tf.int32, [nbatch]) else: actions_per_agent = 2 A = tf.placeholder(tf.int32, [actions_per_agent, nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) if particleEnv == False: step_model = policy( sess, ob_space, ac_space, nenvs, 1, nstack, reuse=tf.AUTO_REUSE, continuous_actions=continuous_actions) #, itr=itr) train_model = policy( sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=tf.AUTO_REUSE, continuous_actions=continuous_actions) #, itr=itr) elif communication == False: # print('step model') step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False, continuous_actions=continuous_actions, itr=itr, communication=communication) # print('train model') train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=tf.AUTO_REUSE, continuous_actions=continuous_actions, itr=itr, communication=communication) else: step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=tf.AUTO_REUSE, continuous_actions=continuous_actions, itr=itr, communication=communication) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=tf.AUTO_REUSE, continuous_actions=continuous_actions, itr=itr, communication=communication) # else: # else: # step_model = [] # train_model = [] # for i in range(numAgents): # step_model.append(policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=tf.AUTO_REUSE, continuous_actions=continuous_actions)) # train_model.append(policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True, continuous_actions=continuous_actions)) # print(train_model) if self.continuous_actions: neglogpac = tf.log(mse(train_model.mu, A)) elif particleEnv == False or communication == False: # print('A: ', A) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) pg_loss = tf.reduce_mean(ADV * neglogpac) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef else: neglogpac = [] entropy = [] pg_loss = [] loss = [] vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) neglogpac_ = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi_c, labels=A[0]) entropy_ = tf.reduce_mean(cat_entropy(train_model.pi_c)) pg_loss_ = tf.reduce_mean(ADV * neglogpac_) entropy.append(entropy_) pg_loss.append(pg_loss_) loss.append(pg_loss_ - entropy_ * ent_coef + 
vf_loss * vf_coef) neglogpac_ = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi_u, labels=A[1]) entropy_ = tf.reduce_mean(cat_entropy(train_model.pi_u)) pg_loss_ = tf.reduce_mean(ADV * neglogpac_) entropy.append(entropy_) pg_loss.append(pg_loss_) loss.append(pg_loss_ - entropy_ * ent_coef + vf_loss * vf_coef) params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # f itr == 0: # trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = tf.train.AdamOptimizer( learning_rate=LR, name=str(itr) ).apply_gradients( grads ) # , decay=alpha, epsilon=epsilon, name=str(itr)).apply_gradients(grads) # _train = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon, name=str(itr)).apply_gradients(grads) # Error here lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values, debug=False, numAgents=2): # print('train rewards and values') # print(actions[0]) # print(actions[1]) # print(rewards) # print(values) advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } # if states != []: if train_model.initial_state != []: # print(states) td_map[train_model.S] = states td_map[train_model.M] = masks if debug == True: policy_loss, value_loss, policy_entropy, all_grad_vals, _ = sess.run( [pg_loss, vf_loss, entropy, grads, _train], td_map) # grad_vals = [(np.min(grad_vals), np.max(grad_vals), np.sum(grad_vals)) for grad_vals in all_grad_vals] # print('Policy Gradients: ') # print(all_grad_vals[9]) # print('Value Gradients: ') # print(all_grad_vals[11]) print('Gradient Values: ') print(all_grad_vals) else: policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) # else: # td_map = [] # print('Train Model in train') # print(train_model) # for i in range(numAgents): # td_map = {train_model[i].X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} # if train_model[i].initial_state != []: # print('states') # print(states) # td_map[train_model[i].S] = states # td_map[train_model[i].M] = masks # if debug: # print('point1') # policy_loss, value_loss, policy_entropy, all_grad_vals, _ = sess.run( # [pg_loss, vf_loss, entropy, grads, _train], # td_map # ) # print('point2') # grad_vals = [(np.min(grad_vals), np.max(grad_vals), np.sum(grad_vals)) for grad_vals in all_grad_vals] # print('Policy Gradients: ') # print(all_grad_vals[9]) # print('Value Gradients: ') # print(all_grad_vals[11]) # else: # policy_loss, value_loss, policy_entropy, _ = sess.run( # [pg_loss, vf_loss, entropy, _train], # td_map # ) # print('Policy Loss: ') # print(policy_loss) # print('Value Loss: ') # print(value_loss) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) #make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model # if numAgents == 1: self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state # else: # self.step = [] # self.value = [] # self.initial_state = [] # for i in range(numAgents): # 
self.step.append(step_model[i].step) # self.value.append(step_model[i].value) # self.initial_state.append(step_model[i].initial_state) self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nscripts=16, nsteps=20, nstack=4, ent_coef=0.1, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.001, kfac_clip=0.001, lrschedule='linear', alpha=0.99, epsilon=1e-5): config = tf.ConfigProto( allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nsml.bind(sess=sess) #nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) XY0 = tf.placeholder(tf.int32, [nbatch]) XY1 = tf.placeholder(tf.int32, [nbatch]) # ADV == TD_TARGET - values ADV = tf.placeholder(tf.float32, [nbatch]) TD_TARGET = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy( sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) self.model2 = train_model = policy( sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) # Policy 1 : Base Action : train_model.pi label = A script_mask = tf.concat( [ tf.zeros([nscripts * nsteps, 1]), tf.ones([(nprocs - nscripts) * nsteps, 1]) ], axis=0) pi = train_model.pi pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi, labels=A) neglogpac *= tf.stop_gradient(pac_weight) inv_A = 1.0 - tf.cast(A, tf.float32) xy0_mask = tf.cast(A, tf.float32) xy1_mask = tf.cast(A, tf.float32) condition0 = tf.equal(xy0_mask, 2) xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) xy0_mask = 1.0 - xy0_mask condition1 = tf.equal(xy1_mask, 2) xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) # One hot representation of chosen marine. # [batch_size, 2] pi_xy0 = train_model.pi_xy0 pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 pac_weight = tf.reduce_sum( pac_weight * tf.one_hot(XY0, depth=1024), axis=1) logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi_xy0, labels=XY0) logpac_xy0 *= tf.stop_gradient(pac_weight) logpac_xy0 *= tf.cast(xy0_mask, tf.float32) pi_xy1 = train_model.pi_xy1 pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 pac_weight = tf.reduce_sum( pac_weight * tf.one_hot(XY0, depth=1024), axis=1) # 1D? 2D? 
logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi_xy1, labels=XY1) logpac_xy1 *= tf.stop_gradient(pac_weight) logpac_xy1 *= tf.cast(xy1_mask, tf.float32) pg_loss = tf.reduce_mean(ADV * neglogpac) pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) vf_ = tf.squeeze(train_model.vf) vf_r = tf.concat( [ tf.ones([nscripts * nsteps, 1]), tf.zeros([(nprocs - nscripts) * nsteps, 1]) ], axis=0) * TD_TARGET vf_masked = vf_ * script_mask + vf_r #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps] vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) entropy = entropy_a + entropy_xy0 + entropy_xy1 loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) self.logits = logits = train_model.pi # xy0 self.params_common = params_common = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') self.params_xy0 = params_xy0 = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss self.grads_check_xy0 = grads_xy0 = tf.gradients( train_loss_xy0, params_xy0) if max_grad_norm is not None: grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) grads_xy0 = list(zip(grads_xy0, params_xy0)) trainer_xy0 = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) # xy1 self.params_xy1 = params_xy1 = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss self.grads_check_xy1 = grads_xy1 = tf.gradients( train_loss_xy1, params_xy1) if max_grad_norm is not None: grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) grads_xy1 = list(zip(grads_xy1, params_xy1)) trainer_xy1 = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, td_targets, masks, actions, xy0, xy1, values): advs = td_targets - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = { train_model.X: obs, A: actions, XY0: xy0, XY1: xy1, ADV: advs, TD_TARGET: td_targets, PG_LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _, \ policy_loss_xy0, policy_entropy_xy0, _, \ policy_loss_xy1, policy_entropy_xy1, _ = sess.run( [pg_loss, vf_loss, entropy, _train, pg_loss_xy0, entropy_xy0, _train_xy0, pg_loss_xy1, entropy_xy1, _train_xy1], td_map) return policy_loss, value_loss, policy_entropy, \ policy_loss_xy0, policy_entropy_xy0, \ policy_loss_xy1, policy_entropy_xy1 def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save 
self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state print("global_variables_initializer start") tf.global_variables_initializer().run(session=sess) print("global_variables_initializer complete")
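# A minimal NumPy sketch (not part of the model above) of how the script_mask
# re-weighting in the block above behaves: rows belonging to scripted
# environments get a constant weight of 1.0, while self-play rows are weighted
# by the current policy's probability of the action actually taken. Toy sizes
# below are assumptions for illustration only.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

nscripts, nprocs, nsteps, nact = 1, 2, 2, 3
logits = np.random.randn(nprocs * nsteps, nact)          # stands in for train_model.pi
actions = np.random.randint(nact, size=nprocs * nsteps)  # stands in for A

script_mask = np.concatenate([np.zeros((nscripts * nsteps, 1)),
                              np.ones(((nprocs - nscripts) * nsteps, 1))], axis=0)
pac_weight = script_mask * (softmax(logits) - 1.0) + 1.0      # scripted rows -> 1.0
pac_weight = (pac_weight * np.eye(nact)[actions]).sum(axis=1)  # agent rows -> pi(a_t | s_t)
print(pac_weight)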
def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef, vf_coef, max_grad_norm):
    sess = tf.get_default_session()

    actions_ = tf.placeholder(tf.int32, [None], name="actions_")
    advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
    rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
    lr_ = tf.placeholder(tf.float32, name="learning_rate_")

    step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False)
    train_model = policy(sess, ob_space, action_space, nenvs * nsteps, nsteps, reuse=True)

    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=actions_)
    pg_loss = tf.reduce_mean(advantages_ * neglogpac)
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), rewards_))
    entropy = tf.reduce_mean(train_model.pd.entropy())
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, decay=0.99, epsilon=1e-5)
    _train = trainer.apply_gradients(grads)

    def train(states_in, actions, returns, values, lr):
        # advantage A(s, a) = returns - values
        advantages = returns - values
        td_map = {train_model.inputs_: states_in,
                  actions_: actions,
                  advantages_: advantages,
                  rewards_: returns,
                  lr_: lr}
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        saver = tf.train.Saver()
        saver.save(sess, save_path)

    def load(load_path):
        saver = tf.train.Saver()
        print("Loading " + load_path)
        saver.restore(sess, load_path)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
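# The blocks in this file call mse(...) and cat_entropy(...) without defining
# them; they are assumed to come from the a2c utility module. A sketch of what
# those helpers are expected to compute, consistent with how they are used
# above (not necessarily the exact original implementation):
import tensorflow as tf

def mse(pred, target):
    # elementwise squared error; callers apply tf.reduce_mean themselves
    return tf.square(pred - target) / 2.0

def cat_entropy(logits):
    # entropy of a categorical distribution parameterised by unnormalised logits
    a0 = logits - tf.reduce_max(logits, 1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, 1, keepdims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)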
def __init__(self, policy, ob_space, ac_space, nenvs, master_ts = 1, worker_ts = 30, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, cell = 256, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', algo='regular', beta=1e-3): print('Create Session') gpu_options = tf.GPUOptions(allow_growth=True) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) nact = ac_space.n nbatch = nenvs*master_ts*worker_ts A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, 1, cell = cell, model='step_model', algo=algo) train_model = policy(sess, ob_space, ac_space, nbatch, master_ts, worker_ts, model='train_model', algo=algo) print('model_setting_done') #loss construction neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.wpi)) pg_loss = pg_loss - entropy * ent_coef print('algo: ', algo, 'max_grad_norm: ', str(max_grad_norm)) try: if algo == 'regular': loss = pg_loss + vf_coef * vf_loss elif algo == 'VIB': ''' implement VIB here, apart from the vf_loss and pg_loss, there should be a third loss, the kl_loss = ds.kl_divergence(model.encoding, prior), where prior is a Gaussian distribution with mu=0, std=1 the final loss should be pg_loss + vf_coef * vf_loss + beta*kl_loss ''' prior = ds.Normal(0.0, 1.0) kl_loss = tf.reduce_mean(ds.kl_divergence(train_model.encoding, prior)) loss = pg_loss + vf_coef * vf_loss + beta*kl_loss # pass else: raise Exception('Algorithm not exists') except Exception as e: print(e) grads, global_norm = grad_clip(loss, max_grad_norm, ['model']) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(wobs, whs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(whs)): cur_lr = lr.value() td_map = {train_model.wX:wobs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.wS] = states td_map[train_model.wM] = masks ''' you can add and run additional loss for VIB here for debugging, such as kl_loss ''' tloss, value_loss, policy_loss, policy_entropy, _ = sess.run( [loss, vf_loss, pg_loss, entropy, _train], feed_dict=td_map ) return tloss, value_loss, policy_loss, policy_entropy params = find_trainable_variables("model") def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.wvalue self.get_wh = step_model.get_wh self.initial_state = step_model.w_initial_state self.train = train self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
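# For the VIB branch above, ds.kl_divergence(encoding, Normal(0, 1)) has a
# closed form when the encoder outputs a diagonal Gaussian N(mu, sigma):
#   KL = 0.5 * (mu^2 + sigma^2 - 2*log(sigma) - 1)   per dimension.
# A NumPy check with illustrative values (mu and sigma are not model outputs):
import numpy as np

mu, sigma = 0.3, 0.8
kl_closed_form = 0.5 * (mu**2 + sigma**2 - 2.0 * np.log(sigma) - 1.0)
print(kl_closed_form)  # ~0.0881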
def __init__(self, policy, p, has_state): """ policy : Internal Policy model such as SnakeModel.CNNPolicy p : Hyperparameters required for training """ sess = tf_util.make_session() # Tensorflow model initiallization step_model = policy(sess=sess, p=p, train_phase=False, has_state=has_state) # Deploy model settings train_model = policy(sess=sess, p=p, train_phase=True, has_state=has_state) # Training model settings saver = tf.train.Saver() #Step 2 : Initialize the training parameters A = tf.placeholder(tf.int32, [p.N_BATCH]) ADV = tf.placeholder(tf.float32, [p.N_BATCH]) R = tf.placeholder(tf.float32, [p.N_BATCH]) LR = tf.placeholder(tf.float32, []) #Step 3 : Define the loss Function neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) # pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * p.ENTROPY_COEFF + vf_loss * p.VALUE_FUNC_COEFF #Step 4 : Define the loss optimizer params = find_trainable_variables("model") grads = tf.gradients(loss, params) if p.MAX_GRAD_NORM is not None: grads, grad_norm = tf.clip_by_global_norm( grads, p.MAX_GRAD_NORM ) # Clipping the gradients to protect learned weights grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=p.RMS_DECAY, epsilon=p.EPSILON) _train = trainer.apply_gradients( grads) # This is the variable which will be used lr = Scheduler(v=p.LEARNING_RATE, nvalues=p.N_TIMESTEPS, schedule=p.LEARNING_RATE_SCHEDULE ) # Learning rate changes linearly or as per arguments # Step 5 : Write down the summary parameters to be used writer = tf.summary.FileWriter(p.LOG_PATH) #summary writer def train(obs, rewards, masks, actions, values, states): """ obs : batch x n x m x 1 snake matrix rewards : batch x 1 rewards corrosponding to action actions : batch x 1 discrete action taken values : batch x 1 output of value function during the training process """ advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, train_model.S: states, A: actions, ADV: advs, R: rewards, LR: cur_lr } policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): #ps = sess.run(params) #make_path(save_path) #joblib.dump(ps, save_path) saver.save(sess, save_path) def load(load_path): #loaded_params = joblib.load(load_path) #restores = [] #for p, loaded_p in zip(params, loaded_params): # restores.append(p.assign(loaded_p)) #ps = sess.run(restores) saver.restore(sess, load_path) def add_scalar_summary(tag, value, step): summary = tf.Summary( value=[tf.Summary.Value(tag=tag, simple_value=value)]) writer.add_summary(summary, step) # Expose the user to closure functions self.train = train self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.hidden_value = step_model.hidden_value self.initial_state = step_model.initial_state self.add_scalar_summary = add_scalar_summary self.save = save self.load = load # Initialize global variables and add tf graph tf.global_variables_initializer().run(session=sess) writer.add_graph(tf.get_default_graph()) #write graph
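# Usage sketch for the scalar-summary closure exposed above; it wraps a raw
# tf.Summary so metrics appear in TensorBoard without a merged summary op.
# Tag names and variables below are illustrative, not from the original loop:
# policy_loss, value_loss, policy_entropy = model.train(obs, rewards, masks, actions, values, states)
# model.add_scalar_summary('loss/policy', float(policy_loss), update_step)
# model.add_scalar_summary('loss/value', float(value_loss), update_step)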
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs*nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Defines step_model function and train_model functions # Pass each model a copy of 'sess' print("Constructing model... STEP_MODEL & TRAIN_MODEL: constructing step_model policy | " + str(policy)) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) # train_model takes in the mini-batch produced by 5 step_models, NOTE: reuse = true train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) # var init: this neglogpac is still somewhat unknown, # looks like it does softmax over policy layer of training model neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) print("MAIN: neglocpac = sparse_softmax_cross_entropy_with_logits() inputs: ") print("MAIN: train_model_pi: " + str(train_model.pi)) print("MAIN: labels: " + str(A)) # var init: policy gradient loss determined by average of all advantage * neglogpac pg_loss = tf.reduce_mean(ADV * neglogpac) # value function loss is mse(tf.squeeze(train_model.vf), R) # ^ in english, mse(model value prediction, actual Reward) # mse == means squared error, defined in a2c/utils.py vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) # entropy of policy entropy = tf.reduce_mean(cat_entropy(train_model.pi)) # total loss calculation? # todo: is this the loss function definition??? check with a3c paper loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef # params gets trainable variables from model (weights of network?) params = find_trainable_variables("model") # computes gradients (change of weights, or direction of weights) using 'loss' and 'params' above # computes 'symbolic derivatives of sum 'loss' w.r.t 'params' # from tflow docs: 'gradients() adds ops to the graph to output the derivs of 'params' grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) # TODO: how many gradients are computed here, should be 16 grads = list(zip(grads, params)) # RMSProp optimizes learning rate , check thesis notes trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) # RMSProp pushes back new gradients over trainable variables to change weights _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) writer = tf.summary.FileWriter("/tmp/helloTensorBoard.txt") writer.add_graph(sess.graph) # Trains the model, # TODO: What is 'masks' input param # TODO: How often does train_model (steps thru train_model) get run vs. step_model # A: I think it does a 'train_model' for each mini-batch, which is currently 5 steps # Does a sess.run with train_model def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() # td_map hooks up all inputs for train model? td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks # Policy Loss, Value Loss, and Policy Entropy calculations # Propagates losses backwards through the neural network? 
policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): path = logger.get_dir() + "/model.pkl" print("Logger dir: " + logger.get_dir()) print("MODEL SAVED TO : " + str(path)) ps = sess.run(params) #make_path(osp.dirname(save_path)) joblib.dump(ps, path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
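# A NumPy sketch of the quantities the comments in the block above ask about
# (neglogpac, pg_loss, vf_loss, entropy), computed on a toy batch. All shapes
# and numbers are illustrative only.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

logits = np.array([[2.0, 0.5], [0.1, 1.2]])   # stands in for train_model.pi
actions = np.array([0, 1])                     # A
advs = np.array([0.7, -0.3])                   # ADV = rewards - values
returns = np.array([1.0, 0.2])                 # R
values = np.array([0.3, 0.5])                  # tf.squeeze(train_model.vf)

probs = softmax(logits)
neglogpac = -np.log(probs[np.arange(2), actions])   # sparse softmax cross-entropy
pg_loss = np.mean(advs * neglogpac)
vf_loss = np.mean(0.5 * (values - returns) ** 2)    # mse(...) then reduce_mean
entropy = np.mean(-(probs * np.log(probs)).sum(axis=1))
loss = pg_loss - 0.01 * entropy + 0.5 * vf_loss
print(pg_loss, vf_loss, entropy, loss)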
def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef, vf_coef, max_grad_norm): sess = tf.get_default_session() # Here we create the placeholders actions_ = tf.placeholder(tf.int32, [None], name="actions_") advantages_ = tf.placeholder(tf.float32, [None], name="advantages_") rewards_ = tf.placeholder(tf.float32, [None], name="rewards_") lr_ = tf.placeholder(tf.float32, name="learning_rate_") # Here we create our two models: # Step_model that is used for sampling step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False) # Train model for training train_model = policy(sess, ob_space, action_space, nenvs*nsteps, nsteps, reuse=True) """ Calculate the loss Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss """ # Policy loss # Output -log(pi) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=actions_) # 1/n * sum A(si,ai) * -logpi(ai|si) pg_loss = tf.reduce_mean(advantages_ * neglogpac) # Value loss 1/2 SUM [R - V(s)]^2 vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf),rewards_)) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Build our trainer trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, decay=0.99, epsilon=1e-5) # 4. Backpropagation _train = trainer.apply_gradients(grads) def train(states_in, actions, returns, values, lr): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # Returns = R + yV(s') advantages = returns - values # We create the feed dictionary td_map = {train_model.inputs_: states_in, actions_: actions, advantages_: advantages, # Use to calculate our policy loss rewards_: returns, # Use as a bootstrap for real value lr_: lr} policy_loss, value_loss, policy_entropy, _= sess.run([pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): """ Save the model """ saver = tf.train.Saver() saver.save(sess, save_path) def load(load_path): """ Load the model """ saver = tf.train.Saver() print('Loading ' + load_path) saver.restore(sess, load_path) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
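# The runner that produces `returns` is not shown here; a common construction,
# consistent with the comment "Returns = R + yV(s')" above, bootstraps from the
# last value estimate and zeroes the bootstrap at episode boundaries. This is a
# sketch under that assumption, not the original runner code:
import numpy as np

def n_step_returns(rewards, dones, last_value, gamma=0.99):
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = last_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns

print(n_step_returns(np.array([1., 0., 1.]), np.array([0., 0., 1.]), last_value=0.5))
# -> [1.9801, 0.99, 1.0]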
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs*nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
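# `Scheduler` is assumed to come from the a2c utility module; a minimal sketch
# of the 'linear' behaviour relied on above, where each call to value() takes
# one step toward zero over total_timesteps (simplified, not the original class):
class LinearScheduler(object):
    def __init__(self, v, nvalues):
        self.v, self.nvalues, self.n = v, float(nvalues), 0.0

    def value(self):
        frac = 1.0 - self.n / self.nvalues   # anneals from 1 down to 0
        self.n += 1.0
        return self.v * frac

sched = LinearScheduler(v=7e-4, nvalues=10)
print([round(sched.value(), 5) for _ in range(3)])  # [0.0007, 0.00063, 0.00056]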
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', replay_lambda=1, ss_rate=1, replay_loss=None): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs*nsteps # If we have replay_loss, create replay buffer and stage buffer # Use this to enforce replay loss lower if replay_loss is not None: self.replay_buffer = [] # holds all past data A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) # Introduce replay_loss if given if replay_loss == "L2": # Replace train_model.pi with whatever is predicted label # Replace A with whatever is recorded label re_loss = tf.nn.l2_loss(tf.nn.softmax(train_model.pi) - A) / nbatch elif replay_loss == "Distillation": # Replace y_donor with whatever is recorded label # Replace y_acceptor with whatever is predicted label re_loss = tf.reduce_mean( - tf.reduce_sum(tf.stop_gradient(y_donor) * tf.log(y_acceptor), reduction_indices=1)) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef if replay_loss is not None: loss = loss + replay_lambda*re_loss params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
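# The replay-loss branches above leave y_donor / y_acceptor undefined and
# compare softmax(train_model.pi) directly with integer labels. A sketch of how
# the distillation variant is usually written, assuming teacher_logits and
# student_logits tensors of shape [nbatch, nact] (both names are placeholders,
# not tensors defined in the snippet):
import tensorflow as tf

def distillation_loss(teacher_logits, student_logits):
    teacher_p = tf.stop_gradient(tf.nn.softmax(teacher_logits))
    student_logp = tf.nn.log_softmax(student_logits)
    # cross-entropy between the (fixed) teacher distribution and the student
    return tf.reduce_mean(-tf.reduce_sum(teacher_p * student_logp, axis=1))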
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nenvs, inter_op_parallelism_threads=nenvs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) self.saver = tf.train.Saver(max_to_keep=1000) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(path, steps): make_path(path) self.saver.save(sess, path + 'model', global_step=steps) def load(path, steps): self.saver = tf.train.import_meta_graph(path + 'model' + '-' + str(steps) + '.meta') self.saver.restore(sess, tf.train.latest_checkpoint(path)) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
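# Usage sketch for the step-tagged checkpoint closures above. The directory is
# an assumption; the closures append 'model' directly to the path, so a
# trailing separator is expected:
# model.save('./checkpoints/', steps=100000)   # writes ./checkpoints/model-100000.*
# model.load('./checkpoints/', steps=100000)   # re-imports the meta graph, restores latest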
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef, vf_coef, max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nbatch = nenvs * nsteps step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) A = train_model.pdtype.sample_placeholder([nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) eps = 1e-6 #nadv = ADV / (train_model.ret_rms.std + eps) #nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps) nadv = (ADV - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps) nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps) nlogpac = -train_model.pd.logp(A) pg_loss = tf.reduce_mean(nadv * nlogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), nr)) #vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vnorm), nr)) entropy = tf.reduce_mean(train_model.pd.entropy()) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) avg_norm_ret = tf.reduce_mean(tf.abs(train_model.ret_rms.mean)) avg_norm_obs = tf.reduce_mean(tf.abs(train_model.ob_rms.mean)) def train(obs, states, returns, masks, actions, values): advs = returns - values #advs = (advs - np.mean(advs)) / (np.std(advs) + eps) for step in range(len(obs)): cur_lr = lr.value() if hasattr(train_model, "ob_rms"): train_model.ob_rms.update( sess, obs) # update running mean/std for observations of policy if hasattr(train_model, "ret_rms"): train_model.ret_rms.update( sess, returns) # # update running mean/std for returns td_map = { train_model.X: obs, A: actions, ADV: advs, R: returns, LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks ravg_norm_obs, policy_loss, value_loss, policy_entropy, _ = sess.run( [avg_norm_obs, pg_loss, vf_loss, entropy, _train], td_map) return ravg_norm_obs, policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
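# ob_rms / ret_rms above are assumed to be running mean/std trackers updated
# from each batch. A NumPy sketch of the usual parallel mean/variance update
# such helpers perform (a simplified stand-in, not the original class):
import numpy as np

class RunningMeanStdSketch(object):
    def __init__(self, shape=()):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 1e-4

    def update(self, x):
        b_mean, b_var, b_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = b_mean - self.mean
        tot = self.count + b_count
        new_mean = self.mean + delta * b_count / tot
        m2 = self.var * self.count + b_var * b_count + delta**2 * self.count * b_count / tot
        self.mean, self.var, self.count = new_mean, m2 / tot, tot

rms = RunningMeanStdSketch()
rms.update(np.random.randn(64))
print(rms.mean, np.sqrt(rms.var))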
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(20e6), lrschedule='linear'): ''' sess = tf.get_default_session() nbatch = nenvs*nsteps step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) A = train_model.pdtype.sample_placeholder([nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) ''' # begin diff sess = tf.get_default_session() step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, reuse=True) L = tf.placeholder(tf.int32, [1]) A = train_model.pdtype.sample_placeholder([None]) ADV = tf.placeholder(tf.float32, [None]) R = tf.placeholder(tf.float32, [None]) LR = tf.placeholder(tf.float32, []) # end diff neglogpac = train_model.pd.neglogp(A) # length max_episode_steps pg_loss = tf.reduce_mean(tf.slice(ADV * neglogpac, [0], L)) vf_loss = tf.reduce_mean(tf.slice(mse(tf.squeeze(train_model.vf), R), [0], L)) entropy = tf.reduce_mean(tf.slice(train_model.pd.entropy(), [0], L)) loss = pg_loss-entropy*ent_coef+vf_loss*vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values, length): advs = rewards-values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr, L:np.asarray([length])} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run([pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps writter = tf.summary.FileWriter( "/tmp/a2c_demo/1") # Change for SAT: this is to use tensorBoard A = tf.placeholder( tf.int32, [nbatch]) # Comments by Fei: this must be the action ADV = tf.placeholder( tf.float32, [nbatch]) # Comments by Fei: this must be the advantage R = tf.placeholder( tf.float32, [nbatch]) # Comments by Fei: this must be the reward LR = tf.placeholder( tf.float32, []) # Comments by Fei: this must be the learning rate step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) # Comments by Fei: pi is nbatch * nact pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) # writter.add_graph(sess.graph) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def build_model(self, max_grad_norm, reuse=tf.AUTO_REUSE): #reuse is true id loading #buld a 4 layer fc net for actor # X=tf.placeholder([-1,6,4]) states_in = tf.layers.flatten(self.states) with tf.variable_scope("model", reuse=reuse): a1 = tf.layers.dropout(inputs=tf.layers.dense( inputs=states_in, units=64, activation=tf.nn.relu), rate=0.3) self.a2 = tf.layers.dropout(tf.layers.dense(inputs=a1, units=128, activation=tf.nn.relu), rate=0.2) self.a3 = tf.layers.dropout(tf.layers.dense(inputs=self.a2, units=128, activation=tf.nn.relu), rate=0.1) self.out = tf.layers.dense( inputs=self.a3, units=4, activation=tf.nn.relu, kernel_initializer=tf.orthogonal_initializer(np.sqrt(2))) self.value = tf.layers.dense(inputs=self.a3, units=1, activation=None) # # self.pd, self.pi = self.pdtype.pdfromlatent(self.out, init_scale=0.01) # with baselines from openai self.pd, self.pi, _ = self.pdtype.proba_distribution_from_latent( self.out, self.value, init_scale=0.01 ) # with stable_baselines see https://stable-baselines.readthedocs.io/en/master/common/distributions.html?highlight=vf%20latent%20vector # self.pd, self.pi = self.pdtype.pdfromlatent(self.out, init_scale=0.01) self.a0 = self.pd.sample() #calculate the loss function neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=self.pi, labels=self.actions) # 1/n * sum A(si,ai) * -logpi(ai|si) pg_loss = tf.reduce_mean(self.advantages * neglogpac) # Value loss 1/2 SUM [R - V(s)]^2 vf_loss = tf.reduce_mean(mse(tf.squeeze(self.value), self.rewards)) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(self.pd.entropy()) self.loss = pg_loss - entropy * self.Entropy_coefficient + vf_loss * self.vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("model") # 2. Calculate the gradients grads = tf.gradients(self.loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Build our trainer trainer = tf.train.RMSPropOptimizer(learning_rate=self.lr, decay=0.99, epsilon=1e-5) # 4. Backpropagation self.train_op = trainer.apply_gradients(grads)
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) self.logits = logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV * logpac) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.vf + tf.random_normal(tf.shape( train_model.vf)) self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean( tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params = params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss, params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ momentum=0.9, kfac_update=1, epsilon=0.01,\ stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads, params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, PG_LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
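# The value-function Fisher term above perturbs the value prediction with unit
# Gaussian noise and penalises the squared distance to that (stopped) sample;
# up to an additive constant and a factor of two this is the log-likelihood of
# a unit-variance Gaussian centred on the prediction, which is the quantity the
# K-FAC Fisher estimate needs. A NumPy illustration with made-up numbers:
import numpy as np

vf = np.array([0.2, -0.4])
sample_net = vf + np.random.randn(2)          # vf + N(0, 1) noise
sq_term = -np.mean((vf - sample_net) ** 2)    # matches vf_fisher_loss / vf_fisher_coef
log_lik = np.mean(-0.5 * (sample_net - vf) ** 2 - 0.5 * np.log(2 * np.pi))
print(sq_term, 2 * log_lik + np.log(2 * np.pi))  # equal up to floating point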
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(20e6), lrschedule='linear'): sess = tf.get_default_session() nbatch = nenvs * nsteps step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True) A = train_model.pdtype.sample_placeholder([nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(train_model.pd.entropy()) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, discounted_rewards, rewards, masks, prev_actions, actions, values, dones): advs = discounted_rewards - values for step in range(len(obs)): cur_lr = lr.value() # reshape actions, rewards, and dones to have first dimension of size nenvs*nsteps, existing second dimension # this is already done for obs rews = np.reshape(rewards, (nbatch, 1)) ds = np.reshape(np.asarray(dones, dtype=np.float32), (nbatch, 1)) if len(ac_space.shape) == 0: prev_actions = np.reshape(prev_actions, (nbatch, )) one_hot = np.eye(ac_space.n)[prev_actions] for i in range(nbatch): if prev_actions[i] == -1: one_hot[i, :] = np.zeros((ac_space.n, ), dtype=np.int) x = np.concatenate((obs, one_hot, rews, ds), axis=1) actions = np.reshape(actions, (nbatch, )) else: prev_actions = np.reshape(prev_actions, (nbatch, ac_space.shape[0])) x = np.concatenate((obs, prev_actions, rews, ds), axis=1) td_map = { train_model.X: x, A: actions, ADV: advs, R: discounted_rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
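# A NumPy sketch of the input construction performed in train() above for the
# discrete-action case: each row is [obs, one_hot(prev_action), reward, done],
# with prev_action == -1 (no previous action) mapped to an all-zero one-hot.
# Toy sizes are illustrative only.
import numpy as np

nbatch, obs_dim, nact = 3, 4, 2
obs = np.random.randn(nbatch, obs_dim)
prev_actions = np.array([-1, 0, 1])
rews = np.array([[0.0], [1.0], [0.5]])
ds = np.array([[0.0], [0.0], [1.0]])

one_hot = np.eye(nact)[np.maximum(prev_actions, 0)]
one_hot[prev_actions == -1] = 0.0          # no previous action -> zero vector
x = np.concatenate((obs, one_hot, rews, ds), axis=1)
print(x.shape)  # (3, 4 + 2 + 1 + 1)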