def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs):  # pylint: disable=W0613
    self.pdtype = make_pdtype(ac_space)
    # X, processed_x = observation_input(ob_space, nbatch)
    X, processed_x = observation_input(ob_space, None)
    with tf.variable_scope("model", reuse=reuse):
        h = nature_cnn(processed_x, **conv_kwargs)
        vf = fc(h, 'v', 1)[:, 0]
        self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

    a0 = self.pd.sample()
    neglogp0 = self.pd.neglogp(a0)
    self.initial_state = None
    self.entropy = cat_entropy(self.pi)

    def step(ob, *_args, **_kwargs):
        a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
        return a, v, self.initial_state, neglogp

    def value(ob, *_args, **_kwargs):
        return sess.run(vf, {X: ob})

    def neg_log_prob(actions):
        return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.pi, labels=actions)

    self.X = X
    self.vf = vf
    self.step = step
    self.value = value
    self.neg_log_prob = neg_log_prob
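# A minimal sketch of how this policy object is consumed by a driver loop
# (assumed, illustrative code; `env` is a vectorized gym-style environment and
# `model` an instance of the policy above -- neither is defined in this file):
def rollout(model, env, nsteps):
    """Collect one segment of experience with the policy's step()/value() fns."""
    obs = env.reset()
    segment = []
    for _ in range(nsteps):
        actions, values, _states, neglogps = model.step(obs)
        obs, rewards, dones, _ = env.step(actions)
        segment.append((obs, actions, rewards, values, neglogps))
    return segment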
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
             ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99,
             epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', logdir=None):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=num_procs,
                            inter_op_parallelism_threads=num_procs)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    nact = ac_space.n
    nbatch = nenvs * nsteps

    ADV = tf.placeholder(tf.float32, [None])
    R = tf.placeholder(tf.float32, [None])
    LR = tf.placeholder(tf.float32, [])

    step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi,
                                                               labels=train_model.a0)
    entropy = tf.reduce_sum(cat_entropy(train_model.pi))
    params = find_trainable_variables("model")
    tf.summary.histogram("vf", train_model.vf)
    tf.summary.histogram("R", R)

    if train_model.relaxed:
        # RELAX-style estimator: build the policy gradient manually, using the
        # value head as a learned control variate instead of differentiating a
        # scalar surrogate loss.
        pg_loss = tf.constant(0.0)
        oh_A = tf.one_hot(train_model.a0, ac_space.n)

        params = find_trainable_variables("model")
        policy_params = [v for v in params if "pi" in v.name]
        vf_params = [v for v in params if "vf" in v.name]
        entropy_grads = tf.gradients(entropy, policy_params)

        ddiff_loss = tf.reduce_sum(train_model.vf - train_model.vf_t)
        ddiff_grads = tf.gradients(ddiff_loss, policy_params)

        sm = tf.nn.softmax(train_model.pi)
        dlogp_dpi = oh_A * (1. - sm) + (1. - oh_A) * (-sm)
        pi_grads = -((tf.expand_dims(R, 1) - train_model.vf_t) * dlogp_dpi)
        pg_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads)
        pg_grads = [pg - dg for pg, dg in zip(pg_grads, ddiff_grads)]
        pi_param_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads)

        # Train the control variate (value head) to minimize the squared norm
        # of the policy-gradient estimate, i.e. its variance.
        cv_grads = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0)
        cv_grad_splits = tf.reduce_sum(tf.square(cv_grads))
        vf_loss = cv_grad_splits * vf_coef
        cv_grads = tf.gradients(vf_loss, vf_params)

        policy_grads = []
        for e_grad, p_grad, param in zip(entropy_grads, pg_grads, policy_params):
            grad = -e_grad * ent_coef + p_grad
            policy_grads.append(grad)
        grad_dict = {}
        for g, v in list(zip(policy_grads, policy_params)) + list(zip(cv_grads, vf_params)):
            grad_dict[v] = g
        grads = [grad_dict[v] for v in params]
        print(grads)
    else:
        pg_loss = tf.reduce_sum((tf.stop_gradient(R) - tf.stop_gradient(train_model.vf)) * neglogpac)
        policy_params = [v for v in params if "pi" in v.name]
        pg_grads = tf.gradients(pg_loss, policy_params)
        vf_loss = tf.reduce_sum(mse(tf.squeeze(train_model.vf), R))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        grads = tf.gradients(loss, params)

    grads = list(zip(grads, params))

    # Track exponential moving averages of the flattened policy gradient and
    # its square, to log the (log-)variance of the gradient estimator.
    ema = tf.train.ExponentialMovingAverage(.99)
    all_policy_grads = tf.concat([tf.reshape(g, [-1]) for g in pg_grads], 0)
    all_policy_grads_sq = tf.square(all_policy_grads)
    apply_mean_op = ema.apply([all_policy_grads, all_policy_grads_sq])
    em_mean = ema.average(all_policy_grads)
    em_mean_sq = ema.average(all_policy_grads_sq)
    em_var = em_mean_sq - tf.square(em_mean)
    em_log_var = tf.log(em_var + 1e-20)
    mlgv = tf.reduce_mean(em_log_var)

    for g, v in grads:
        print(v.name, g)
        tf.summary.histogram(v.name, v)
        tf.summary.histogram(v.name + "_grad", g)

    self.sum_op = tf.summary.merge_all()
    self.writer = tf.summary.FileWriter(logdir)

    trainer = tf.train.AdamOptimizer(learning_rate=LR, beta2=.99999)
    with tf.control_dependencies([apply_mean_op]):
        _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
    self._step = 0
    def train(obs, states, rewards, masks, u1, u2, values, summary=False):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()
        td_map = {
            train_model.X: obs, train_model.U1: u1, train_model.U2: u2,
            ADV: advs, R: rewards, LR: cur_lr
        }
        if states != []:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        if summary:
            sum_str, policy_loss, value_loss, policy_entropy, lv, _ = sess.run(
                [self.sum_op, pg_loss, vf_loss, entropy, mlgv, _train], td_map)
            self.writer.add_summary(sum_str, self._step)
        else:
            policy_loss, value_loss, policy_entropy, lv, _ = sess.run(
                [pg_loss, vf_loss, entropy, mlgv, _train], td_map)
        self._step += 1
        return policy_loss, value_loss, policy_entropy, lv

    def save(save_path):
        ps = sess.run(params)
        make_path(save_path)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        ps = sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
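# The EMA bookkeeping above tracks the variance of the flattened policy
# gradient via Var[g] = E[g^2] - (E[g])^2 and logs its mean log value (`mlgv`).
# A minimal NumPy sketch of the same statistic (illustrative; `decay` mirrors
# the 0.99 used above):
import numpy as np

def update_grad_variance_ema(g, ema_mean, ema_sq, decay=0.99):
    """Update EMAs of g and g**2; return the new EMAs and mean log-variance."""
    ema_mean = decay * ema_mean + (1. - decay) * g
    ema_sq = decay * ema_sq + (1. - decay) * g ** 2
    var = ema_sq - ema_mean ** 2            # elementwise variance estimate
    mlgv = np.mean(np.log(var + 1e-20))     # matches the `mlgv` summary op above
    return ema_mean, ema_sq, mlgv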
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5,
             max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5,
             total_timesteps=int(80e6), lrschedule='linear',
             replay_lambda=1, ss_rate=1, replay_loss=None):
    sess = tf_util.make_session()
    nact = ac_space.n
    nbatch = nenvs * nsteps

    # If we have replay_loss, create replay buffer and stage buffer.
    # Use this to enforce replay loss lower.
    if replay_loss is not None:
        self.replay_buffer = []  # holds all past data

    A = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True)

    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))

    # Introduce replay_loss if given
    if replay_loss == "L2":
        # Replace train_model.pi with whatever is predicted label
        # Replace A with whatever is recorded label
        re_loss = tf.nn.l2_loss(tf.nn.softmax(train_model.pi) - A) / nbatch
    elif replay_loss == "Distillation":
        # Replace y_donor with whatever is recorded label
        # Replace y_acceptor with whatever is predicted label
        re_loss = tf.reduce_mean(
            -tf.reduce_sum(tf.stop_gradient(y_donor) * tf.log(y_acceptor), reduction_indices=1))

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
    if replay_loss is not None:
        loss = loss + replay_lambda * re_loss

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()
        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        ps = sess.run(params)
        make_path(save_path)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        ps = sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
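# For reference, a self-contained version of the "Distillation" branch above,
# with the undefined y_donor / y_acceptor spelled out (hypothetical helper; in
# practice the recorded "donor" probabilities would come from the replay
# buffer rather than being recomputed):
def distillation_loss(recorded_logits, predicted_logits):
    """Cross-entropy from the recorded (donor) to the predicted (acceptor) policy."""
    y_donor = tf.stop_gradient(tf.nn.softmax(recorded_logits))
    y_acceptor = tf.nn.softmax(predicted_logits)
    return tf.reduce_mean(-tf.reduce_sum(y_donor * tf.log(y_acceptor + 1e-8), axis=1))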
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, nsteps,
             ent_coef, vf_coef, max_grad_norm, cell=256, sv_M=32,
             algo='regular', ib_alpha=1e-3):
    sess = tf_util.make_session()
    act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, 1,
                       cell=cell, M=sv_M, model='step_model', algo=algo)
    train_model = policy(sess, ob_space, ac_space, nbatch_train, 1, nsteps,
                         cell=cell, M=sv_M, model='train_model', algo=algo)

    A = train_model.wpdtype.sample_placeholder([None])
    ADV = tf.placeholder(tf.float32, [None])
    R = tf.placeholder(tf.float32, [None])
    OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
    OLDNEGLOGPAC_expand = tf.placeholder(tf.float32, [None, sv_M])
    OLDVPRED = tf.placeholder(tf.float32, [None])
    OLDVPRED_expand = tf.placeholder(tf.float32, [None, sv_M])
    LR = tf.placeholder(tf.float32, [])
    CLIPRANGE = tf.placeholder(tf.float32, [])

    if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian':
        def expand_placeholder(X, M=sv_M):
            return tf.tile(tf.expand_dims(X, axis=-1), [1, M])

        A_expand, R_expand = expand_placeholder(A), expand_placeholder(R)
        neglogpac_expand = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.wpi_expand, labels=A_expand)  # shape=[nbatch, sv_M]
        entropy_expand = tf.reduce_mean(cat_entropy(train_model.wpi_expand), axis=-1)  # shape=[nbatch]
        vpred_expand = train_model.wvf_expand[:, :, 0]
        vpredclipped_expand = OLDVPRED_expand + tf.clip_by_value(
            train_model.wvf_expand[:, :, 0] - OLDVPRED_expand, -CLIPRANGE, CLIPRANGE)
        vf_loss1_expand = tf.square(vpred_expand - R_expand)
        vf_loss2_expand = tf.square(vpredclipped_expand - R_expand)
        vf_loss_expand = .5 * tf.reduce_mean(
            tf.maximum(vf_loss1_expand, vf_loss2_expand), axis=-1)  # shape=[nbatch]
        ratio_expand = tf.exp(OLDNEGLOGPAC_expand - neglogpac_expand)
        ADV_expand = R_expand - OLDVPRED_expand
        # ADV_expand_mean, ADV_expand_var = tf.nn.moments(ADV_expand, axes=0, keep_dims=True)  # shape=[1, sv_M]
        ADV_expand_mean, ADV_expand_var = tf.nn.moments(ADV_expand, axes=[0, 1])
        ADV_expand_normal = (ADV_expand - ADV_expand_mean) / (tf.sqrt(ADV_expand_var) + 1e-8)
        pg_losses_expand = -ADV_expand_normal * ratio_expand
        pg_losses2_expand = -ADV_expand_normal * tf.clip_by_value(
            ratio_expand, 1. - CLIPRANGE, 1. + CLIPRANGE)
        pg_loss_expand = tf.reduce_mean(tf.maximum(pg_losses_expand, pg_losses2_expand), axis=-1)
        J_theta = -(pg_loss_expand + vf_coef * vf_loss_expand - ent_coef * entropy_expand)
        loss_expand = -J_theta / float(nbatch_train)
        pg_loss_expand_ = tf.reduce_mean(pg_loss_expand)
        vf_loss_expand_ = tf.reduce_mean(vf_loss_expand)
        entropy_expand_ = tf.reduce_mean(entropy_expand)

        log_p_grads = tf.gradients(J_theta / np.sqrt(ib_alpha),
                                   [train_model.wh_expand])[0]  # shape=[nbatch, sv_M, cell]
        if algo == 'use_svib_gaussian':
            mean, var = tf.nn.moments(train_model.wh_expand, axes=1, keep_dims=True)  # shape=[nbatch, 1, cell]
            gaussian_grad = -(train_model.wh_expand - mean) / (float(sv_M) * (var + 1e-3))
            log_p_grads += 5e-3 * (tf_l2norm(log_p_grads, axis=-1, keep_dims=True) /
                                   tf_l2norm(gaussian_grad, axis=-1, keep_dims=True)) * gaussian_grad

        sv_grads = tf.constant(0., tf.float32, shape=[nbatch_train, 0, cell])
        exploit_total_norm_square = 0
        explore_total_norm_square = 0
        # explore_coef scales the repulsive (exploration) term per environment;
        # env_name is expected to be defined at module level.
        explore_coef = 1.
        if env_name == 'SeaquestNoFrameskip-v4':
            explore_coef = 0.01
        elif env_name in ['AirRaidNoFrameskip-v4', 'BreakoutNoFrameskip-v4',
                          'AtlantisNoFrameskip-v4', 'StarGunnerNoFrameskip-v4',
                          'AsteroidsNoFrameskip-v4', 'YarsRevengeNoFrameskip-v4']:
            explore_coef = 0.
        print('env_name:', env_name, 'explore_coef: ', explore_coef)

        for i in range(sv_M):
            exploit = tf.reduce_sum(train_model.rpf_matrix[:, :, i:i + 1] * log_p_grads, axis=1)
            explore = np.sqrt(ib_alpha) * explore_coef * train_model.rpf_grads[:, i, :]
            exploit_total_norm_square += tf.square(tf_l2norm(exploit, axis=-1, keep_dims=False))
            explore_total_norm_square += tf.square(tf_l2norm(explore, axis=-1, keep_dims=False))
            sv_grad = exploit + explore  # shape=[nbatch, cell]
            sv_grads = tf.concat([sv_grads, tf.expand_dims(sv_grad, axis=1)], axis=1)

        SV_GRADS = tf.placeholder(tf.float32, [nbatch_train, sv_M, cell])
        repr_loss = tf.reduce_mean(SV_GRADS * train_model.wh_expand, axis=1)  # shape=[nbatch, cell]
        repr_loss = -tf.reduce_mean(tf.reduce_sum(repr_loss, axis=-1))  # max optimization problem to minimization problem

        # op for debugging and visualization
        exploit_explore_ratio = tf.sqrt(
            exploit_total_norm_square / tf.maximum(explore_total_norm_square, 0.01))[0]
        # rpf_mat = tf.expand_dims(train_model.rpf_matrix, axis=-1)
        # log_p_grads_tile = tf.tile(tf.expand_dims(log_p_grads, axis=2), [1, 1, sv_M, 1])
        # exploit = tf.reduce_sum(rpf_mat * log_p_grads_tile, axis=1)
        # explore = np.sqrt(ib_alpha) * train_model.rpf_grads
        # sv_grads = exploit + explore
        # ind = 1
        # exploit = tf.reduce_sum(train_model.rpf_matrix[:, :, i:i + 1] * log_p_grads, axis=1)
        # explore = train_model.rpf_grads[:, i, :]
        # clip_coef = tf_l2norm(exploit, axis=-1, keep_dims=True)
        # explore_norm = tf_l2norm(explore, axis=-1, keep_dims=True)
        # explore = explore * 1e-2 * clip_coef / tf.maximum(explore_norm, clip_coef)
        # sv_grad = exploit + np.sqrt(ib_alpha) * explore  # shape=[nbatch, cell]

        grads_expand, grad_norm_expand = grad_clip(loss_expand, max_grad_norm, ['model/worker_module'])
        trainer_expand = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train_expand = trainer_expand.apply_gradients(grads_expand)
        repr_grads, repr_global_norm = grad_clip(repr_loss, max_grad_norm, ['model/ordinary_encoder'])
        repr_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _repr_train = repr_trainer.apply_gradients(repr_grads)
    else:
        print('env_name:', env_name)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A)
        entropy = tf.reduce_mean(cat_entropy(train_model.wpi))
        vpred = train_model.wvf[:, 0]
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.wvf[:, 0] - OLDVPRED, -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        grads, grad_norm = grad_clip(loss, max_grad_norm,
                                     ['model/worker_module', 'model/ordinary_encoder'])
        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = trainer.apply_gradients(grads)

    with tf.variable_scope('model'):
        params = tf.trainable_variables()

    def generate_old_expand_data(obs, noises, masks, actions, states=None):
        noises_expand = sess.run(train_model.noise_expand)
        repr_td_map = {train_model.wX: obs, train_model.istraining: False, A: actions,
                       train_model.noise_expand: noises_expand,
                       train_model.NOISE_KEEP: noises}
        if states is not None:
            repr_td_map[train_model.wS] = states
            repr_td_map[train_model.wM] = masks
        neglogpacs_expand, vpreds_expand = sess.run(
            [neglogpac_expand, vpred_expand], feed_dict=repr_td_map)
        shape = noises_expand.shape
        noises_expand = noises_expand.reshape(nbatch_train, sv_M - 1, *shape[1:])
        return [noises_expand, neglogpacs_expand, vpreds_expand]

    def train(lr, cliprange, obs, noises, returns, masks, actions, values, neglogpacs,
              noises_expand=None, neglogpacs_expand=None, vpreds_expand=None, states=None):
        advs = returns - values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian':
            shape = noises_expand.shape
            noises_expand_ = noises_expand.reshape(nbatch_train * (sv_M - 1), *shape[2:])
            # print(noises_expand_.shape)
            repr_td_map = {train_model.wX: obs, train_model.istraining: True, A: actions,
                           R: returns, LR: lr, CLIPRANGE: cliprange,
                           train_model.noise_expand: noises_expand_,
                           train_model.NOISE_KEEP: noises,
                           OLDNEGLOGPAC_expand: neglogpacs_expand,
                           OLDVPRED_expand: vpreds_expand}
        rl_td_map = {train_model.istraining: True, A: actions, R: returns,
                     LR: lr, CLIPRANGE: cliprange}
        if states is not None:
            if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian':
                repr_td_map[train_model.wS] = states
                repr_td_map[train_model.wM] = masks
            rl_td_map[train_model.wS] = states
            rl_td_map[train_model.wM] = masks
        if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian':
            sv_gradients, whs_expand, ir_ratio = sess.run(
                [sv_grads, train_model.wh_expand, exploit_explore_ratio], feed_dict=repr_td_map)
            rl_td_map[OLDNEGLOGPAC_expand], rl_td_map[OLDVPRED_expand], rl_td_map[train_model.wh_expand] = \
                neglogpacs_expand, vpreds_expand, whs_expand
            value_loss, policy_loss, policy_entropy, _, rl_grad_norm = sess.run(
                [vf_loss_expand_, pg_loss_expand_, entropy_expand_, _train_expand, grad_norm_expand],
                feed_dict=rl_td_map)
            repr_td_map[SV_GRADS] = sv_gradients
            repr_grad_norm, represent_loss, __ = sess.run(
                [repr_global_norm, repr_loss, _repr_train], feed_dict=repr_td_map)
        else:
            rl_td_map[train_model.wX], rl_td_map[train_model.noise] = obs, noises  # noise won't be used when algo is 'regular'
            rl_td_map[OLDNEGLOGPAC], rl_td_map[OLDVPRED], rl_td_map[ADV] = neglogpacs, values, advs
            value_loss, policy_loss, policy_entropy, _, rl_grad_norm = sess.run(
                [vf_loss, pg_loss, entropy, _train, grad_norm], feed_dict=rl_td_map)
            represent_loss, rpf_norm_, rpf_grad_norm_, sv_gradients, ir_ratio, repr_grad_norm = \
                0., 0., 0., 0., 0., 0.
        return (policy_loss, value_loss, policy_entropy, represent_loss,
                ir_ratio, rl_grad_norm, repr_grad_norm)

    self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'represent_loss',
                       'exploit_explore_ratio', 'rl_grad_norm', 'repr_grad_norm']

    def save(save_path):
        ps = sess.run(params)
        make_path(osp.dirname(save_path))
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)
        # If you want to load weights, also save/load observation scaling inside VecNormalize

    self.generate_old_expand_data = generate_old_expand_data
    self.train = train
    self.train_model = train_model
    self.act_model = act_model
    self.step = act_model.step
    self.value = act_model.wvalue
    self.initial_state = act_model.w_initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)  # pylint: disable=E1101
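# The per-particle loop above implements a Stein variational (SVGD-style)
# update: each particle is driven by a kernel-weighted average of the other
# particles' log-density gradients (the "exploit" term) plus the kernel's own
# gradient (the repulsive "explore" term). A minimal NumPy sketch of one SVGD
# step with an RBF kernel (illustrative; names are ours, not this module's):
import numpy as np

def svgd_step(x, grad_log_p, step_size=1e-2, h=1.0):
    """x: [M, D] particles, grad_log_p: [M, D] gradients of log p at x."""
    diffs = x[:, None, :] - x[None, :, :]                 # [M, M, D], x_i - x_j
    k = np.exp(-np.sum(diffs ** 2, axis=-1) / (2. * h))   # RBF kernel [M, M]
    grad_k = np.sum(k[:, :, None] * diffs, axis=1) / h    # repulsive term [M, D]
    phi = (k @ grad_log_p + grad_k) / x.shape[0]          # Stein direction
    return x + step_size * phi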
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5,
             max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, lambda_dist=0.01,
             total_timesteps=None, lrschedule='linear'):
    sess = tf.get_default_session()
    nact = ac_space.n
    nbatch = nenvs * nsteps

    A = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    config = Config()
    act_model = policy(config)
    config.reuse = True
    train_model = policy(config)

    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.logits, labels=A)
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    entropy = tf.reduce_mean(cat_entropy(train_model.logits))
    # Auxiliary (distillation) term; reduced to a scalar so it combines with
    # the other scalar loss terms instead of broadcasting over the batch.
    aux_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=train_model.rp_logits, labels=A))
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + aux_loss * lambda_dist

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
    saver = tf.train.Saver()

    def train(obs, rs, rr, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()
        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr,
                  train_model.inputs_s: rs, train_model.inputs_r: rr}
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        saver.save(sess, save_path + 'model.ckpt')

    def load(load_path):
        saver.restore(sess, load_path + 'model.ckpt')

    self.train = train
    self.train_model = train_model
    self.act_model = act_model
    self.act = act_model.act
    self.value = act_model.value
    self.save = save
    self.load = load
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
             ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99,
             epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear',
             continuous_actions=False, debug=False, numAgents=2, itr=1,
             particleEnv=False, communication=False):
    self.continuous_actions = continuous_actions
    self.nenvs = nenvs
    print('vf_coef', vf_coef)
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=num_procs,
                            inter_op_parallelism_threads=num_procs)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    # print('action space: ', ac_space)
    if particleEnv == False:
        nact = ac_space.n
    elif communication == False:
        nact = ac_space[itr].n
    else:
        nact = ac_space[itr].high - ac_space[itr].low  # modified
    self.nact = nact
    # print('nact: ', nact)
    nbatch = nenvs * nsteps
    # print('batch size: ', nbatch)

    if self.continuous_actions:
        A = tf.placeholder(tf.float32, [nbatch])
    elif particleEnv == False or communication == False:
        A = tf.placeholder(tf.int32, [nbatch])
    else:
        actions_per_agent = 2
        A = tf.placeholder(tf.int32, [actions_per_agent, nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    if particleEnv == False:
        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=tf.AUTO_REUSE,
                            continuous_actions=continuous_actions)  # , itr=itr)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=tf.AUTO_REUSE,
                             continuous_actions=continuous_actions)  # , itr=itr)
    elif communication == False:
        # print('step model')
        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False,
                            continuous_actions=continuous_actions, itr=itr,
                            communication=communication)
        # print('train model')
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=tf.AUTO_REUSE,
                             continuous_actions=continuous_actions, itr=itr,
                             communication=communication)
    else:
        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=tf.AUTO_REUSE,
                            continuous_actions=continuous_actions, itr=itr,
                            communication=communication)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=tf.AUTO_REUSE,
                             continuous_actions=continuous_actions, itr=itr,
                             communication=communication)
    # else:
    #     step_model = []
    #     train_model = []
    #     for i in range(numAgents):
    #         step_model.append(policy(sess, ob_space, ac_space, nenvs, 1, nstack,
    #                                  reuse=tf.AUTO_REUSE, continuous_actions=continuous_actions))
    #         train_model.append(policy(sess, ob_space, ac_space, nenvs, nsteps, nstack,
    #                                   reuse=True, continuous_actions=continuous_actions))
    # print(train_model)

    if self.continuous_actions:
        neglogpac = tf.log(mse(train_model.mu, A))
    elif particleEnv == False or communication == False:
        # print('A: ', A)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
    else:
        # One loss per action head: pi_c (communication) and pi_u (movement).
        neglogpac = []
        entropy = []
        pg_loss = []
        loss = []
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        neglogpac_ = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_c, labels=A[0])
        entropy_ = tf.reduce_mean(cat_entropy(train_model.pi_c))
        pg_loss_ = tf.reduce_mean(ADV * neglogpac_)
        entropy.append(entropy_)
        pg_loss.append(pg_loss_)
        loss.append(pg_loss_ - entropy_ * ent_coef + vf_loss * vf_coef)
        neglogpac_ = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_u, labels=A[1])
        entropy_ = tf.reduce_mean(cat_entropy(train_model.pi_u))
        pg_loss_ = tf.reduce_mean(ADV * neglogpac_)
        entropy.append(entropy_)
        pg_loss.append(pg_loss_)
        loss.append(pg_loss_ - entropy_ * ent_coef + vf_loss * vf_coef)

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    # if itr == 0:
    #     trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = tf.train.AdamOptimizer(learning_rate=LR, name=str(itr)).apply_gradients(grads)
    # _train = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon,
    #                                    name=str(itr)).apply_gradients(grads)  # Error here

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values, debug=False, numAgents=2):
        # print('train rewards and values')
        # print(actions[0]); print(actions[1]); print(rewards); print(values)
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()
        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr}
        # if states != []:
        if train_model.initial_state != []:
            # print(states)
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        if debug == True:
            policy_loss, value_loss, policy_entropy, all_grad_vals, _ = sess.run(
                [pg_loss, vf_loss, entropy, grads, _train], td_map)
            # grad_vals = [(np.min(gv), np.max(gv), np.sum(gv)) for gv in all_grad_vals]
            # print('Policy Gradients: '); print(all_grad_vals[9])
            # print('Value Gradients: '); print(all_grad_vals[11])
            print('Gradient Values: ')
            print(all_grad_vals)
        else:
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
        # else:  # per-agent variant, kept for reference
        #     td_map = []
        #     print('Train Model in train')
        #     print(train_model)
        #     for i in range(numAgents):
        #         td_map = {train_model[i].X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr}
        #         if train_model[i].initial_state != []:
        #             print('states')
        #             print(states)
        #             td_map[train_model[i].S] = states
        #             td_map[train_model[i].M] = masks
        #         if debug:
        #             print('point1')
        #             policy_loss, value_loss, policy_entropy, all_grad_vals, _ = sess.run(
        #                 [pg_loss, vf_loss, entropy, grads, _train], td_map)
        #             print('point2')
        #             grad_vals = [(np.min(gv), np.max(gv), np.sum(gv)) for gv in all_grad_vals]
        #             print('Policy Gradients: '); print(all_grad_vals[9])
        #             print('Value Gradients: '); print(all_grad_vals[11])
        #         else:
        #             policy_loss, value_loss, policy_entropy, _ = sess.run(
        #                 [pg_loss, vf_loss, entropy, _train], td_map)
        # print('Policy Loss: '); print(policy_loss)
        # print('Value Loss: '); print(value_loss)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        ps = sess.run(params)
        # make_path(save_path)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        ps = sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    # if numAgents == 1:
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    # else:
    #     self.step, self.value, self.initial_state = [], [], []
    #     for i in range(numAgents):
    #         self.step.append(step_model[i].step)
    #         self.value.append(step_model[i].value)
    #         self.initial_state.append(step_model[i].initial_state)
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nsteps=20,
             ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
             kfac_clip=0.001, lrschedule='linear'):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=nprocs,
                            inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
    self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True)

    logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
    self.logits = logits = train_model.pi

    ## training loss
    pg_loss = tf.reduce_mean(ADV * logpac)
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    pg_loss = pg_loss - ent_coef * entropy
    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
    train_loss = pg_loss + vf_coef * vf_loss

    ## Fisher loss construction
    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
    sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
    self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
        tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
    self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

    self.params = params = find_trainable_variables("model")
    self.grads_check = grads = tf.gradients(train_loss, params)

    with tf.device('/gpu:0'):
        # NOTE: `async` became a reserved word in Python 3.7, so the keyword is
        # passed via dict unpacking to keep this file parseable there.
        self.optim = optim = kfac.KfacOptimizer(
            learning_rate=PG_LR, clip_kl=kfac_clip, momentum=0.9, kfac_update=1,
            epsilon=0.01, stats_decay=0.99, cold_iter=10,
            max_grad_norm=max_grad_norm, **{'async': 1})
        update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
        train_op, q_runner = optim.apply_gradients(list(zip(grads, params)))

    self.q_runner = q_runner
    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = self.lr.value()
        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, PG_LR: cur_lr}
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, train_op], td_map)
        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        ps = sess.run(params)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    tf.global_variables_initializer().run(session=sess)
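# kfac's apply_gradients returns a QueueRunner for its asynchronous statistics
# updates; the driver is expected to start it before calling train(). A sketch
# of that wiring (assumed usage; `model` is an instance of the class above):
def start_kfac_queue(model):
    """Start the K-FAC stats QueueRunner threads; required before training."""
    coord = tf.train.Coordinator()
    enqueue_threads = []
    if model.q_runner is not None:
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    return coord, enqueue_threads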
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, v_mix_coef=0.5,
             max_grad_norm=0.5, lr_alpha=7e-4, lr_beta=7e-4, alpha=0.99, epsilon=1e-5,
             total_timesteps=int(80e6), lrschedule='linear',
             r_ex_coef=1.0, r_in_coef=0.0, v_ex_coef=1.0):
    sess = tf_util.make_session()
    nact = ac_space.n
    nbatch = nenvs * nsteps

    A = tf.placeholder(tf.int32, [nbatch], 'A')
    R_EX = tf.placeholder(tf.float32, [nbatch], 'R_EX')
    ADV_EX = tf.placeholder(tf.float32, [nbatch], 'ADV_EX')
    RET_EX = tf.placeholder(tf.float32, [nbatch], 'RET_EX')
    V_MIX = tf.placeholder(tf.float32, [nbatch], 'V_MIX')
    DIS_V_MIX_LAST = tf.placeholder(tf.float32, [nbatch], 'DIS_V_MIX_LAST')
    COEF_MAT = tf.placeholder(tf.float32, [nbatch, nbatch], 'COEF_MAT')
    LR_ALPHA = tf.placeholder(tf.float32, [], 'LR_ALPHA')
    LR_BETA = tf.placeholder(tf.float32, [], 'LR_BETA')

    step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True)

    # Mix the extrinsic reward with the learned intrinsic reward of the taken action.
    r_mix = r_ex_coef * R_EX + r_in_coef * tf.reduce_sum(
        train_model.r_in * tf.one_hot(A, nact), axis=1)
    ret_mix = tf.squeeze(tf.matmul(COEF_MAT, tf.reshape(r_mix, [nbatch, 1])), [1]) + DIS_V_MIX_LAST
    adv_mix = ret_mix - V_MIX

    neglogpac = train_model.pd.neglogp(A)
    pg_mix_loss = tf.reduce_mean(adv_mix * neglogpac)
    v_mix_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_mix), ret_mix))
    entropy = tf.reduce_mean(cat_entropy(train_model.pi))
    policy_loss = pg_mix_loss - ent_coef * entropy + v_mix_coef * v_mix_loss

    policy_params = train_model.policy_params
    policy_grads = tf.gradients(policy_loss, policy_params)
    if max_grad_norm is not None:
        policy_grads, policy_grad_norm = tf.clip_by_global_norm(policy_grads, max_grad_norm)
    policy_grads_and_vars = list(zip(policy_grads, policy_params))
    policy_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_ALPHA, decay=alpha, epsilon=epsilon)
    policy_train = policy_trainer.apply_gradients(policy_grads_and_vars)

    # Replicate the RMSProp update symbolically to obtain the post-update
    # policy parameters, so the intrinsic-reward objective can differentiate
    # through the policy's own update step.
    rmss = [policy_trainer.get_slot(var, 'rms') for var in policy_params]
    policy_params_new = {}
    for grad, rms, var in zip(policy_grads, rmss, policy_params):
        ms = rms + (tf.square(grad) - rms) * (1 - alpha)
        policy_params_new[var.name] = var - LR_ALPHA * grad / tf.sqrt(ms + epsilon)
    policy_new = train_model.policy_new_fn(policy_params_new, ob_space, ac_space, nbatch, nsteps)

    neglogpac_new = policy_new.pd.neglogp(A)
    ratio_new = tf.exp(tf.stop_gradient(neglogpac) - neglogpac_new)
    pg_ex_loss = tf.reduce_mean(-ADV_EX * ratio_new)
    v_ex_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_ex), RET_EX))
    intrinsic_loss = pg_ex_loss + v_ex_coef * v_ex_loss

    intrinsic_params = train_model.intrinsic_params
    intrinsic_grads = tf.gradients(intrinsic_loss, intrinsic_params)
    if max_grad_norm is not None:
        intrinsic_grads, intrinsic_grad_norm = tf.clip_by_global_norm(intrinsic_grads, max_grad_norm)
    intrinsic_grads_and_vars = list(zip(intrinsic_grads, intrinsic_params))
    intrinsic_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_BETA, decay=alpha, epsilon=epsilon)
    intrinsic_train = intrinsic_trainer.apply_gradients(intrinsic_grads_and_vars)

    lr_alpha = Scheduler(v=lr_alpha, nvalues=total_timesteps, schedule=lrschedule)
    lr_beta = Scheduler(v=lr_beta, nvalues=total_timesteps, schedule=lrschedule)

    all_params = tf.global_variables()

    def train(obs, policy_states, masks, actions, r_ex, ret_ex, v_ex, v_mix,
              dis_v_mix_last, coef_mat):
        advs_ex = ret_ex - v_ex
        for step in range(len(obs)):
            cur_lr_alpha = lr_alpha.value()
            cur_lr_beta = lr_beta.value()
        td_map = {train_model.X: obs, policy_new.X: obs, A: actions, R_EX: r_ex,
                  ADV_EX: advs_ex, RET_EX: ret_ex, V_MIX: v_mix,
                  DIS_V_MIX_LAST: dis_v_mix_last, COEF_MAT: coef_mat,
                  LR_ALPHA: cur_lr_alpha, LR_BETA: cur_lr_beta}
        if policy_states is not None:
            td_map[train_model.PS] = policy_states
            td_map[train_model.M] = masks
        return sess.run([entropy, policy_train, intrinsic_train], td_map)[0]

    def save(save_path):
        ps = sess.run(all_params)
        make_path(osp.dirname(save_path))
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(all_params, loaded_params):
            restores.append(p.assign(loaded_p))
        ps = sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.intrinsic_reward = step_model.intrinsic_reward
    self.init_policy_state = step_model.init_policy_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)
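# ret_mix above is COEF_MAT @ r_mix + DIS_V_MIX_LAST, i.e. the runner encodes
# discounting in a dense [nbatch, nbatch] matrix: coef_mat[i, j] = gamma**(j-i)
# for j >= i within the same episode and 0 elsewhere, with DIS_V_MIX_LAST
# carrying the discounted bootstrap value. A sketch of that construction for a
# single environment (assumed from the usage above; names are ours):
import numpy as np

def build_coef_mat(dones, gamma):
    """dones[j] is True if the episode ended at step j."""
    nbatch = len(dones)
    coef_mat = np.zeros((nbatch, nbatch), dtype=np.float32)
    for i in range(nbatch):
        coef = 1.0
        for j in range(i, nbatch):
            coef_mat[i, j] = coef
            coef *= gamma
            if dones[j]:
                break  # later rewards do not back up across episode boundaries
    return coef_mat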
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5,
             max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5,
             total_timesteps=int(80e6), lrschedule='linear', hparams=None):
    assert hparams is not None
    hparams['_vf_coef'] = vf_coef

    # Create the session.
    sess = tf_util.make_session(per_process_gpu_memory_fraction=hparams.get('gpu_fraction', 0.25))
    self.sess = sess

    # Copy hparams.
    self.hparams = hparams
    self.nenvs = nenvs
    self.nsteps = nsteps
    self.hparams['batch_size'] = nenvs * nsteps

    # Setup constants.
    nact = ac_space.n
    nbatch = nenvs * nsteps
    self.nbatch = nbatch
    nh, nw, nc = ob_space.shape
    ob_shape_train = (nbatch, nh, nw, nc)
    ob_shape_step = (nenvs, nh, nw, nc)

    # Setup placeholders.
    A = tf.placeholder(tf.int32, [nbatch])
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])
    TEACHER_C = tf.placeholder(tf.float32, [])
    DROPOUT_STRENGTH = tf.placeholder(tf.float32, [], name='DROPOUT_STRENGTH')
    self.DROPOUT_STRENGTH = DROPOUT_STRENGTH
    X_train = tf.placeholder(tf.float32, ob_shape_train, name='Ob_train')  # obs
    X_step = tf.placeholder(tf.float32, ob_shape_step, name='Ob_step')  # obs
    attention_truth = None

    step_hparams = copy.deepcopy(hparams)
    train_hparams = copy.deepcopy(hparams)
    # if self.hparams.get('fixed_dropout_noise'):
    #     self.step_env_random = tf.get_variable(
    #         shape=[nenvs, 7, 7, 1],
    #         name='env_random',
    #         initializer=tf.truncated_normal_initializer(),
    #         trainable=False,
    #     )
    #     self.train_env_random = tf.tile(tf.expand_dims(self.step_env_random, axis=0),
    #                                     multiples=[nsteps, 1, 1, 1, 1])
    #     self.train_env_random = tf.reshape(
    #         tf.transpose(self.train_env_random, perm=[1, 0, 2, 3, 4]),
    #         [nbatch, 7, 7, 1])
    #     step_hparams['_env_random'] = self.step_env_random
    #     train_hparams['_env_random'] = self.train_env_random
    # train_hparams['_dropout_strength'] = DROPOUT_STRENGTH
    # step_hparams['_dropout_strength'] = DROPOUT_STRENGTH

    # Create the models.
    step_model = policy(sess, X_step, ob_space, ac_space, nenvs, 1,
                        reuse=False, hparams=step_hparams)
    train_model = policy(sess, X_train, ob_space, ac_space, nenvs * nsteps, nsteps,
                         reuse=True, hparams=train_hparams)

    if hparams.get('teacher_ckpt'):
        assert hparams.get('use_fixed_attention') or hparams.get('learn_attention_from_teacher') \
            or hparams.get('do_joint_training')
        # Create the teacher, so that way we can use its attention weights
        # instead of learning how to do attention on our own.
        # step_teacher = self._create_sfmnet(X_step, reuse=False, is_step_model=True)
        train_teacher = self._create_object_segmentation_net(
            X_train,
            reuse=False,
            is_step_model=False,
            embedding=train_model.original_h if hparams['do_joint_training'] else None,
        )
        train_attention_truth, train_attention_mask = self._get_attention_truth(
            train_teacher, is_step_model=False)
        # step_attention_truth = self._get_attention_truth(step_teacher, is_step_model=True)
        # if hparams.get('use_fixed_attention'):
        #     step_hparams['_attention_truth'] = step_attention_truth
        #     train_hparams['_attention_truth'] = train_attention_truth
        # if hparams.get('do_joint_training'):
        #     step_hparams['_teacher_h3'] = step_teacher.conv3
        #     step_hparams['_teacher_h'] = step_teacher.embedding
        #     train_hparams['_teacher_h3'] = train_teacher.conv3
        #     train_hparams['_teacher_h'] = train_teacher.embedding

    # if hparams.get('use_target_model'):
    #     assert not hparams.get('do_joint_training')
    #     target_hparams = copy.copy(train_hparams)
    #     target_hparams['_policy_scope'] = 'target_model'
    #     target_hparams['_src_scope'] = 'model'
    #     target_model = policy(sess, X_step, ob_space, ac_space, nenvs, 1,
    #                           reuse=False, hparams=target_hparams)
    #     target_model.setup_copy_weights()
    #     self.target_model = target_model

    scaled_images = tf.cast(train_model.X, tf.float32) / 255.
    print('scaled_images shape: {}'.format(scaled_images))
    sfm_base = object_segmentation.ObjectSegmentationBase(
        frames=scaled_images, embedding=train_model.h)
    sfm_hparams = copy.deepcopy(hparams)
    sfm_hparams['batch_size'] = nenvs * nsteps

    tf.summary.image('frame0', tf.expand_dims(train_model.X[..., -2], axis=-1), max_outputs=1)
    tf.summary.image('frame1', tf.expand_dims(train_model.X[..., -1], axis=-1), max_outputs=1)

    # Create the loss function. Note the function uses its pi/vf arguments, so
    # it can also be applied to the dropout-noise heads in the commented-out
    # data-augmentation block below.
    def a2c_loss(pi, vf):
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(vf), R))
        entropy = tf.reduce_mean(cat_entropy(pi))
        # ent_coef_mode = hparams.get('ent_coef_mode', 'default')
        # ent_coef_val = hparams.get('ent_coef_val', ent_coef)
        # if ent_coef_mode == 'default':
        #     actual_ent_coef = ent_coef_val
        # elif ent_coef_mode == 'linear_teacher':
        #     actual_ent_coef = ent_coef_val * TEACHER_C + ent_coef * (1 - TEACHER_C)
        # elif ent_coef_mode == 'additive_teacher':
        #     actual_ent_coef = ent_coef_val + ent_coef_val * TEACHER_C
        # else:
        #     raise Exception('unrecognized ent_coef_mode: {}'.format(ent_coef_mode))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        return loss, pg_loss, vf_loss, entropy

    loss, pg_loss, vf_loss, entropy = a2c_loss(train_model.pi, train_model.vf)

    # if hparams.get('dropout_data_aug_c'):
    #     logged_augs = False
    #     loss_c = 1.0 - hparams['num_dropout_models'] * hparams['dropout_data_aug_c']
    #     assert loss_c >= hparams['dropout_data_aug_c'] - 1e-5
    #     loss = loss_c * loss
    #     for pi_noise, vf_noise in zip(train_model.pi_noises, train_model.vf_noises):
    #         l2, pg2, vf2, entropy2 = a2c_loss(pi_noise, vf_noise)
    #         loss += l2 * hparams['dropout_data_aug_c']
    #         if not logged_augs:
    #             logged_augs = True
    #             tf.summary.scalar('aug_loss', tf.reduce_mean(l2))
    #             tf.summary.scalar('aug_pgloss', tf.reduce_mean(pg2))
    #             tf.summary.scalar('aug_vfloss', tf.reduce_mean(vf2))
    #             tf.summary.scalar('aug_entropyloss', tf.reduce_mean(entropy2))
    #     print("ADDING DROPOUT DATA AUG")
    # if hasattr(train_model, 'noise_loss') and hparams.get('noise_loss_c'):
    #     loss += train_model.noise_loss
    #     print("ADDING NOISE LOSS")
    # tf.summary.image('frame0', tf.expand_dims(train_model.X[..., -2], -1), max_outputs=1)
    # tf.summary.image('frame1', tf.expand_dims(train_model.X[..., -1], -1), max_outputs=1)

    teacher_loss = 0.0
    if hparams.get('teacher_ckpt') and hparams.get('learn_attention_from_teacher'):
        assert hparams.get('attention_20') or hparams.get('inverted_attention_20')
        # Load in the teacher.
        # teacher = sfmnet.SfmNet(hparams=sfm_hparams, sfm_base=sfm_base, is_teacher_network=True)
        # attention_loss = tf.nn.softmax_cross_entropy_with_logits(
        #     labels=train_attention_truth,
        #     logits=tf.reshape(train_model.attention_logits, [nbatch, -1])
        # )
        # print('attention_loss: {}'.format(attention_loss.get_shape()))
        # print('train_attention_mask: {}'.format(train_attention_mask.get_shape()))
        # attention_loss = attention_loss * train_attention_mask
        # attention_loss = tf.reduce_mean(attention_loss)
        # # for t in [5., 10., 20., 40., 75., 100., 200., 500., 1000.]:
        # #     truth = tf.nn.softmax(coarse_masks / t)
        # #     tf.summary.image('attention_truth_{}'.format(t),
        # #                      tf.reshape(truth, [nbatch, 7, 7, 1]), max_outputs=1)
        # tf.summary.scalar('attention_loss', attention_loss)
        # tf.summary.scalar('attention_teaching', tf.reduce_mean(train_attention_mask))
        # teacher_loss = TEACHER_C * attention_loss
        tf.summary.scalar('teacher_c', TEACHER_C)
        truth, mask = self._get_attention_truth_20(train_teacher, is_step_model=False)
        tf.summary.image('attention_20_truth', tf.reshape(truth, [80, 20, 20, 1]), max_outputs=1)
        if hparams.get('attention_20'):
            attention_loss_20 = tf.nn.softmax_cross_entropy_with_logits(
                labels=truth, logits=tf.reshape(train_model.attention_logits_20, [-1, 400]))
            attention_loss_20 = tf.reduce_mean(attention_loss_20 * mask)
            tf.summary.scalar('attention_loss_20', attention_loss_20)
            tf.summary.scalar('attention_teaching_20', tf.reduce_mean(mask))
            teacher_loss += TEACHER_C * attention_loss_20
        if hparams.get('extrapath_attention_20'):
            print("EXTRAPATH ATTENTION!!!")
            attention_loss_20 = tf.nn.softmax_cross_entropy_with_logits(
                labels=truth,
                logits=tf.reshape(train_model.extrapath_attention_logits_20, [-1, 400]))
            attention_loss_20 = tf.reduce_mean(attention_loss_20 * mask)
            tf.summary.scalar('attention_loss_20', attention_loss_20)
            tf.summary.scalar('attention_teaching_20', tf.reduce_mean(mask))
            teacher_loss += (-TEACHER_C) * attention_loss_20

    # if hparams.get('learn_attention_from_pg'):
    #     attention_logits = tf.reshape(train_model.attention_logits, [nbatch, 49])
    #     attention_actions = sample(attention_logits)
    #     attention_actions = tf.stop_gradient(attention_actions)
    #     attention_neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
    #         logits=attention_logits, labels=attention_actions)
    #     attention_pg_loss = tf.reduce_mean(ADV * attention_neglogpac)
    #     tf.summary.scalar('attention_pg_loss', attention_pg_loss)
    #     loss += attention_pg_loss * hparams['learn_attention_from_pg']
    # if hparams.get('teacher_ckpt') and hparams.get('learn_translation_from_teacher'):
    #     with tf.variable_scope("model"):
    #         with tf.variable_scope('object_translation'):
    #             pred_translation = fc(train_model.h, 'obj_t',
    #                                   nh=2 * self.hparams['k_obj'], init_scale=1.0)
    #             pred_translation = tf.reshape(pred_translation, (-1, self.hparams['k_obj'], 2))
    #     teacher_translation = tf.stop_gradient(train_teacher.object_translation)
    #     translation_loss = mse(pred_translation, teacher_translation)
    #     translation_loss = tf.reduce_mean(translation_loss)
    #     teacher_loss += TEACHER_C * translation_loss
    #     tf.summary.scalar('translation_loss', translation_loss)

    if hparams['do_joint_training']:
        teacher_loss += tf.reduce_mean(
            train_teacher.transform_loss + train_teacher.mask_reg_loss) * TEACHER_C

    if hasattr(train_model, 'attention_logits_20'):
        # Want a low entropy distribution, so that we are focused on only a
        # small part of the image per frame.
        reshaped_logits = tf.reshape(train_model.attention_logits_20, [-1, 400])
        attention_entropy = tf.reduce_mean(cat_entropy(reshaped_logits))
        teacher_loss -= hparams['attention_entropy_c'] * attention_entropy * TEACHER_C
        tf.summary.scalar('attention_entropy', attention_entropy)
    if hasattr(train_model, 'extrapath_attention_logits_20'):
        # Want a low entropy distribution, so that we are focused on only a
        # small part of the image per frame.
        reshaped_logits = tf.reshape(train_model.extrapath_attention_logits_20, [-1, 400])
        attention_entropy = tf.reduce_mean(cat_entropy(reshaped_logits))
        teacher_loss -= hparams['attention_entropy_c'] * attention_entropy * TEACHER_C
        tf.summary.scalar('extrapath_attention_entropy', attention_entropy)
    # if hasattr(train_model, 'attention_weights_20'):
    #     # Want this to be high entropy, so we are looking at different parts
    #     # of the image on different images.
    #     batch_logits = tf.reshape(tf.reduce_sum(train_model.attention_weights_20, axis=0), [1, 400])
    #     attention_entropy = tf.reduce_mean(cat_entropy_softmax(batch_logits))
    #     loss -= hparams['batch_entropy_c'] * attention_entropy
    #     tf.summary.scalar('batch_entropy', attention_entropy)

    # if hparams['do_joint_training'] and False:
    #     assert hparams.get('teacher_ckpt')
    #     teacher_loss += TEACHER_C * train_teacher.total_loss
    # else:
    #     sfm_loss = None
    # if hparams['do_flow_prediction']:
    #     assert hparams.get('teacher_ckpt')
    #     flow_truth_x, flow_truth_y = self._get_flow_truth(train_teacher)
    #     predicted_flow = conv(train_model.flow_base, 'pred_flow', nf=4, rf=1, stride=1, trainable=True)
    #     flow_pred_x = tf.reshape(predicted_flow[..., :2], [-1, 2])
    #     flow_pred_y = tf.reshape(predicted_flow[..., 2:], [-1, 2])
    #     flow_x_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    #         labels=flow_truth_x, logits=flow_pred_x))
    #     flow_y_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    #         labels=flow_truth_y, logits=flow_pred_y))
    #     flow_loss = flow_x_loss + flow_y_loss
    #     # flow_error = tf.reduce_mean(mse(flow_truth, predicted_flow))
    #     teacher_loss += TEACHER_C * flow_loss * hparams['flow_error_c']
    #     flow_x_acc = tf.reduce_mean(tf.cast(tf.argmax(flow_pred_x, axis=-1) == flow_truth_x, tf.int32))
    #     flow_y_acc = tf.reduce_mean(tf.cast(tf.argmax(flow_pred_y, axis=-1) == flow_truth_y, tf.int32))
    #     # tf.summary.scalar('flow_error_if_predict_zeros', tf.reduce_mean(0.5 * tf.square(flow_truth)))
    #     tf.summary.scalar('flow_x_loss', flow_x_loss)
    #     tf.summary.scalar('flow_y_loss', flow_y_loss)
    #     tf.summary.scalar('flow_x_acc', flow_x_acc)
    #     tf.summary.scalar('flow_y_acc', flow_y_acc)
    #     # tf.summary.image('predicted_flow_x', tf.expand_dims(predicted_flow[..., 0], axis=-1), max_outputs=1)
    #     # tf.summary.image('predicted_flow_y', tf.expand_dims(predicted_flow[..., 1], axis=-1), max_outputs=1)

    self.train_writer = tf.summary.FileWriter(
        os.path.join(hparams['base_dir'], 'logs', hparams['experiment_name']), sess.graph)

    # TODO(vikgoel): when we don't need the teacher, we should ensure that we
    # don't merge its summaries so that way we don't need to execute that part
    # of the graph.
    merged_summaries = tf.summary.merge_all()

    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)

    def get_train_op(loss_op):
        params = find_trainable_variables("model")
        # Switch from GATE_NONE to GATE_GRAPH to enhance reproducibility.
        # grads = tf.gradients(loss, params)
        grads_and_params = trainer.compute_gradients(
            loss=loss_op, var_list=params,
            gate_gradients=tf.train.RMSPropOptimizer.GATE_GRAPH)
        grads = [x[0] for x in grads_and_params]
        params = [x[1] for x in grads_and_params]
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        return trainer.apply_gradients(grads)

    _fast_train = get_train_op(loss)
    _teacher_train = get_train_op(loss + teacher_loss)

    params = find_trainable_variables("model")
    print('*' * 20)
    print('chosen trainable variables')
    for p in params:
        print(p.name)
    print('*' * 20)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
    self.lr = lr

    write_counter = 0

    def train(obs, states, rewards, masks, actions, values):
        nonlocal write_counter
        if lr.n % hparams['target_model_update_frequency'] == 0 and hasattr(self, 'target_model'):
            print('COPYING WEIGHTS INTO TARGET MODEL')
            self.target_model.copy_weights()

        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()

        # Smooth approximation:
        # teacher_decay_c = hparams['teacher_decay_c']  # 9.9e-6  # 2.5e-5
        # teacher_c = 1.0 / (teacher_decay_c * lr.n + 1)
        # teacher_c = min(hparams['max_teacher_c'], teacher_c)
        if not hparams['use_extra_path']:
            lerp = float(lr.n) / 1e7
            lerp = min(lerp, 1)
            teacher_c = hparams['max_teacher_c'] * (1. - lerp)
        else:
            teacher_c = 1
        # Linear decay schedule
        # teacher_c = (hparams['teacher_cutoff_step'] - lr.n) / hparams['teacher_cutoff_step']
        # teacher_c = max(teacher_c, 0)
        # # Lower bound on the decay
        # teacher_c = (1 - hparams['teacher_loss_c']) * teacher_c + hparams['teacher_loss_c']

        _train = _fast_train if teacher_c == 0 else _teacher_train
        td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards,
                  LR: cur_lr, TEACHER_C: teacher_c}
        # td_map[DROPOUT_STRENGTH] = get_dropout_strength(hparams, lr.n)
        if self.hparams['teacher_ckpt'] and self.hparams['do_joint_training']:
            td_map[train_teacher.mask_reg_c] = 1
        # if states is not None:
        #     td_map[train_model.S] = states
        #     td_map[train_model.M] = masks

        ops = [pg_loss, vf_loss, entropy, _train]
        # if hparams.get('no_train_a2c'):
        #     ops = ops[:-1]
        if 'attention' in hparams['policy']:
            ops.append(train_model.attention_weights_20)

        write_summaries = hparams.get('teacher_ckpt') or 'attention' in hparams['policy']
        if write_summaries:
            if write_counter % 10 != 0:
                write_summaries = False
            write_counter += 1
        if write_summaries:
            ops.append(merged_summaries)

        sess_results = sess.run(ops, td_map)
        policy_loss = sess_results[0]
        value_loss = sess_results[1]
        policy_entropy = sess_results[2]

        if write_summaries:
            summary = sess_results[-1]
            self.train_writer.add_summary(summary, lr.n)
        if 'attention' in hparams['policy']:
            attention_output = sess_results[-2 if write_summaries else -1]
            publish_attention_weights(attention_output[:5, ...])

        return policy_loss, value_loss, policy_entropy

    def save(save_path):
        ps = sess.run(params)
        make_path(save_path)
        joblib.dump(ps, save_path)

    def load(load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(params, loaded_params):
            restores.append(p.assign(loaded_p))
        ps = sess.run(restores)

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = save
    self.load = load

    # Initialize all of the variables in a deterministic order so that each
    # experiment is reproducible.
    global_vars = tf.global_variables()
    global_vars = sorted(global_vars, key=lambda x: x.name)
    for var in global_vars:
        tf.variables_initializer([var]).run(session=sess)
    # tf.global_variables_initializer().run(session=sess)

    if hparams.get('teacher_ckpt'):
        # Load in the teacher AFTER doing the init so we don't overwrite the weights.
        restore_teacher_from_checkpoint(sess, hparams['teacher_ckpt'])
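# The teacher coefficient computed inside train() above decays linearly from
# max_teacher_c to 0 over the first 1e7 scheduler steps, and is pinned to 1
# when the extra path is enabled. The same schedule as a standalone helper
# (illustrative restatement, not part of the original module):
def teacher_coefficient(step, max_teacher_c, use_extra_path):
    if use_extra_path:
        return 1.0
    lerp = min(float(step) / 1e7, 1.0)
    return max_teacher_c * (1.0 - lerp)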
def __init__(self, policy, ob_space, ac_space, nenvs, master_ts = 1, worker_ts = 8, ent_coef=0.01, vf_coef=0.5, max_grad_norm=2.5, lr=7e-4, cell=256, ib_alpha=0.04, sv_M=32, algo='use_svib_uniform', alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs*master_ts*worker_ts # master what's mean? # A:action, ADV:advantage, R:reward, LR:Learning Rate A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, 1, cell=cell, M=sv_M, model='step_model', algo=algo) train_model = policy(sess, ob_space, ac_space, nbatch, master_ts, worker_ts, cell = cell, M=sv_M, model='train_model', algo=algo) print('model_setting_done, algorithm:', str(algo)) ''' 可视化互信息,暂时跳过 ''' ib_loss = train_model.mi_xh_loss T = train_model.T_value t_grads, t_global_norm = grad_clip(-vf_coef*ib_loss, max_grad_norm, ['model/T/update_params']) t_trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _t_train = t_trainer.apply_gradients(t_grads) T_update_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/T/update_params') T_orig_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/T/orig_params') reset_update_params = [update_param.assign(orig_param) for update_param, orig_param in zip(T_update_params, T_orig_params)] # rpf_matrix, rpf_grads = rpf_kernel(vf_loss_sv, rpf_h) if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian': def expand_placeholder(X, M=sv_M): return tf.tile(tf.expand_dims(X, axis=-1), [1, M]) A_expand, R_expand = expand_placeholder(A), expand_placeholder(R) neglogpac_expand = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi_expand, labels=A_expand)#shape=[nbatch, sv_M] # pg_loss_expand = tf.reduce_mean(ADV_expand * neglogpac_expand, axis=-1) pg_loss_expand = tf.reduce_mean(tf.stop_gradient(R_expand-train_model.wvf_expand[:,:,0]) * neglogpac_expand, axis=-1) vf_loss_expand = tf.reduce_mean(mse(tf.squeeze(train_model.wvf_expand), R_expand), axis=-1) entropy_expand = tf.reduce_mean(cat_entropy(train_model.wpi_expand), axis=-1)#shape=[nbatch] J_theta = -(pg_loss_expand + vf_coef*vf_loss_expand - ent_coef*entropy_expand) loss_expand = -J_theta / float(nbatch) pg_loss_expand_ = tf.reduce_mean(pg_loss_expand) vf_loss_expand_ = tf.reduce_mean(vf_loss_expand) entropy_expand_ = tf.reduce_mean(entropy_expand) loss_expand_ = -tf.reduce_mean(J_theta) print('ib_alpha: ', ib_alpha) log_p_grads = tf.gradients(J_theta/np.sqrt(ib_alpha), [train_model.wh_expand])[0]#shape=[nbatch, sv_M, cell] if algo == 'use_svib_gaussian': mean, var = tf.nn.moments(train_model.wh_expand, axes=1, keep_dims=True)#shape=[nbatch, 1,cell] gaussian_grad = -(train_model.wh_expand - mean)/(float(sv_M) * (var+1e-3)) log_p_grads += 5e-3*(tf_l2norm(log_p_grads, axis=-1, keep_dims=True)/tf_l2norm(gaussian_grad, axis=-1, keep_dims=True))*gaussian_grad sv_grads = tf.constant(0., tf.float32, shape=[nbatch, 0, cell]) for i in range(sv_M): sv_grad = tf.reduce_sum(train_model.rpf_matrix[:, :, i:i+1] * log_p_grads, axis=1) + np.sqrt(ib_alpha)*train_model.rpf_grads[:, i, :]#shape=[nbatch, cell] sv_grads = tf.concat([sv_grads, tf.expand_dims(sv_grad, axis=1)], axis=1) SV_GRADS = tf.placeholder(tf.float32, [nbatch, sv_M, cell]) repr_loss = tf.reduce_mean(SV_GRADS * train_model.wh_expand, axis=1)#shape=[nbatch,cell] repr_loss = 
-tf.reduce_mean(tf.reduce_sum(repr_loss, axis=-1)) # turn the maximization problem into a minimization problem # repr_loss = -tf.reduce_mean(repr_loss, axis=0) # sv_grad_ = tf.reduce_sum(train_model.rpf_matrix[:, :, 2:3] * log_p_grads, axis=1) + train_model.rpf_grads[:, 2, :] # exploit_term = tf.reduce_sum(train_model.rpf_matrix[:, :, 2:3] * log_p_grads, axis=1) # explore_term = train_model.rpf_grads[:, 2, :] grads_expand, global_norm_expand = grad_clip(loss_expand, max_grad_norm, ['model/worker_module']) trainer_expand = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train_expand = trainer_expand.apply_gradients(grads_expand) repr_grads, repr_global_norm = grad_clip(repr_loss, max_grad_norm, ['model/ordinary_encoder']) repr_trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _repr_train = repr_trainer.apply_gradients(repr_grads) elif algo == 'sv_a2c': def expand_placeholder(X, M=sv_M): return tf.tile(tf.expand_dims(X, axis=-1), [1, M]) A_expand, R_expand = expand_placeholder(A), expand_placeholder(R) # [40, 32] sigma = tf.constant(1e-5) neglogpac_expand = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi_expand, labels=A_expand) + sigma # [40, 32] pg_loss_expand = tf.reduce_mean(tf.stop_gradient(R_expand - train_model.wvf_expand[:, :, 0]) * neglogpac_expand, axis=-1) # [40, ] vf_loss_sv = tf.expand_dims(mse(tf.squeeze(train_model.wvf_expand), R_expand), axis=-1) # [40, 32, 1] vf_loss_expand = tf.reduce_mean(mse(tf.squeeze(train_model.wvf_expand), R_expand), axis=-1) # [40, ] entropy_expand = tf.reduce_mean(cat_entropy(train_model.wpi_expand), axis=-1) # shape=[nbatch] J_theta = pg_loss_expand + vf_coef * vf_loss_expand - ent_coef * entropy_expand # [40, ] # why divide by nbatch? loss_expand = J_theta / float(nbatch) # [40, ] pg_loss_expand_ = tf.reduce_mean(pg_loss_expand) vf_loss_expand_ = tf.reduce_mean(vf_loss_expand) # [1] entropy_expand_ = tf.reduce_mean(entropy_expand) loss_expand_ = tf.reduce_mean(J_theta) print('ib_alpha: ', ib_alpha) # mean, var = tf.constant(0., tf.float32, [nbatch, 1, 1]), tf.constant(1, tf.float32, [nbatch, 1, 1]) mean, var = tf.nn.moments(vf_loss_sv, axes=1, keep_dims=True) # [40, 1, 1] # Problem 1: Gaussian gradient calculation log_p_grads = -(vf_loss_sv - mean) / (float(sv_M) * (var)) sv_grads = tf.constant(0., tf.float32, shape=[nbatch, 0, 1]) # [nbatch, m, 1] rpf_h = self.h_coef(vf_loss_sv, sv_M) rpf_matrix, rpf_grads = self.rpf_kernel(vf_loss_sv, rpf_h, sv_M) for i in range(sv_M): # sv_grad = tf.reduce_sum(train_model.rpf_matrix[:, :, i:i+1] * log_p_grads, axis=1) + np.sqrt(ib_alpha) * train_model.rpf_grads[:, i, :] # shape=[nbatch, cell] sv_grad = tf.reduce_sum(rpf_matrix[:, :, i:i + 1] * log_p_grads, axis=1) + rpf_grads[:, i, :] sv_grads = tf.concat([sv_grads, tf.expand_dims(sv_grad, axis=1)], axis=1) SV_GRADS = tf.placeholder(tf.float32, [nbatch, sv_M, 1]) sv_loss = tf.reduce_mean(SV_GRADS * vf_loss_sv, axis=1) loss_expand -= ib_alpha * (tf_l2norm(loss_expand, axis=-1, keep_dims=True)/tf_l2norm(sv_loss, axis=-1, keep_dims=True)) * sv_loss grads_expand, global_norm_expand = grad_clip(loss_expand, max_grad_norm, ['model/worker_module', 'model/ordinary_encoder']) trainer_expand = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train_expand = trainer_expand.apply_gradients(grads_expand) # sv_loss_grads, sv_global_norm = grad_clip(sv_loss, max_grad_norm, ['model/worker_module/comm', 'model/worker_module/w_value']) # sv_trainer = 
tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) # _sv_train = sv_trainer.apply_gradients(sv_loss_grads) elif algo == 'anchor': neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) entropy = tf.reduce_mean(cat_entropy(train_model.wpi)) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R)) # anchor method param_list = [] for scope in ['model/worker_module']: List = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope) print(len(List)) param_list += List param_value_layer1_w = param_list[0] param_value_layer1_b = param_list[1] param_policy_layer2_w = param_list[2] param_value_layer2_w = param_list[4] param_value_layer2_b = param_list[5] init_stddev = 5.0 # 7.0 init_stddev_2 = 0.18 / np.sqrt(cell) # normal scaling lambda_anchor = [0.000001,0.1] layer1_w_init = tf.random_normal(mean=0., stddev=init_stddev, shape=param_value_layer1_w.get_shape()) layer1_b_init = tf.random_normal(mean=0., stddev=init_stddev, shape=param_value_layer1_b.get_shape()) layer2_w_init = tf.random_normal(mean=0, stddev=init_stddev_2, shape=param_value_layer2_w.get_shape()) layer2_b_init = tf.random_normal(mean=0, stddev=init_stddev_2, shape=param_value_layer2_b.get_shape()) loss_anchor = lambda_anchor[0] / nbatch * tf.reduce_sum(tf.square(layer1_w_init - param_value_layer1_w)) loss_anchor += lambda_anchor[0] / nbatch * tf.reduce_sum(tf.square(layer1_b_init - param_value_layer1_b)) loss_anchor += lambda_anchor[1] / nbatch * tf.reduce_sum(tf.square(layer2_w_init - param_value_layer2_w)) loss_anchor += lambda_anchor[1] / nbatch * tf.reduce_sum(tf.square(layer2_b_init - param_value_layer2_b)) loss = pg_loss + vf_coef * vf_loss - ent_coef * entropy + loss_anchor grads, global_norm = grad_clip(loss, max_grad_norm, ['model/worker_module', 'model/ordinary_encoder']) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) else: # regular algorithm neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) entropy = tf.reduce_mean(cat_entropy(train_model.wpi)) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R)) loss = pg_loss + vf_coef * vf_loss - ent_coef * entropy grads, global_norm = grad_clip(loss, max_grad_norm, ['model/worker_module', 'model/ordinary_encoder']) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) params = find_trainable_variables("model") lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(wobs, whs, states, rewards, masks, actions, values, noises): advs = rewards - values # adv_mu, adv_var = np.mean(advs), np.var(advs)+1e-3 # advs = (advs - adv_mu) / adv_var for step in range(len(whs)): cur_lr = lr.value() sv_td_map = {train_model.wX : wobs, train_model.istraining:True, A:actions, R:rewards, LR:cur_lr} # Sess Graph # writer = tf.summary.FileWriter('./', sess.graph) repr_td_map = {train_model.wX: wobs, train_model.istraining: True, A: actions, R: rewards, LR: cur_lr} rl_td_map = {train_model.wX : wobs, train_model.istraining: True, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: rl_td_map[train_model.wS] = states rl_td_map[train_model.wM] = masks repr_grad_norm = 0. 
# print(str(np.sum(whs-sess.run(train_model.wh, feed_dict={train_model.wX : wobs, train_model.istraining:True, train_model.noise:noises})))) if algo == 'use_svib_uniform' or algo == 'use_svib_gaussian': repr_td_map[train_model.noise_expand], repr_td_map[train_model.NOISE_KEEP] = sess.run(train_model.noise_expand), noises wh_expands, sv_gradients = sess.run([train_model.wh_expand, sv_grads], feed_dict=repr_td_map) rl_td_map[train_model.wh_expand] = wh_expands tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, _ = sess.run( [loss_expand_, vf_loss_expand_, pg_loss_expand_, entropy_expand_, global_norm_expand, _train_expand], feed_dict=rl_td_map ) repr_td_map[SV_GRADS] = sv_gradients # if algo == 'use_svib_gaussian': # gaussian_gradients, repr_grad_norm, __ =\ # sess.run([gaussian_grad, repr_global_norm, _repr_train], feed_dict=repr_td_map) # return tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, gaussian_gradients, repr_grad_norm # represent_loss, SV_GRAD, EXPLOIT, LOG_P_GRADS, EXPLORE repr_grad_norm, represent_loss, __ = sess.run([repr_global_norm, repr_loss, _repr_train], feed_dict=repr_td_map) elif algo == 'anchor': rl_td_map[train_model.wX] = wobs tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, anchor_loss, _ = sess.run( [loss, vf_loss, pg_loss, entropy, global_norm, loss_anchor, _train], feed_dict=rl_td_map ) represent_loss = 0. sv_loss_ = 0 elif algo == 'sv_a2c': sv_td_map[train_model.noise_expand], sv_td_map[train_model.NOISE_KEEP] = sess.run( train_model.noise_expand), noises wvf_expands, sv_gradients = sess.run([train_model.wvf_expand, sv_grads], feed_dict=sv_td_map) rl_td_map[train_model.wvf_expand] = wvf_expands rl_td_map[train_model.noise_expand], rl_td_map[train_model.NOISE_KEEP] = sess.run( train_model.noise_expand), noises rl_td_map[SV_GRADS] = sv_gradients tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, sv_loss_, _ = sess.run( [loss_expand_, vf_loss_expand_, pg_loss_expand_, entropy_expand_, global_norm_expand, sv_loss, _train_expand], feed_dict=rl_td_map ) sv_td_map[SV_GRADS] = sv_gradients anchor_loss = 0. represent_loss = 0. else: rl_td_map[train_model.wX], rl_td_map[train_model.noise] = wobs, noises # noise won't be used when algo is 'regular' tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, _ = sess.run( [loss, vf_loss, pg_loss, entropy, global_norm, _train], feed_dict=rl_td_map ) # repr_td_map[WH_GRADS] = wh_gradients # repr_grad_norm, __ = sess.run([ordin_repr_global_norm, _ordin_repr_train], feed_dict=repr_td_map) repr_grad_norm = 0. represent_loss = 0. 
anchor_loss = 0 sv_loss_ = 0 return tloss, value_loss, policy_loss, policy_entropy, rl_grad_norm, repr_grad_norm, represent_loss, anchor_loss, sv_loss_#SV_GRAD, EXPLOIT, LOG_P_GRADS, EXPLORE def train_mine(wobs, whs, steps=256, lr=7e-4): # whs_std = (whs-np.mean(whs,axis=0,keepdims=True))/(1e-8 + np.std(whs,axis=0,keepdims=True)) idx = np.arange(len(whs)) ___ = sess.run(reset_update_params) for i in range(int(steps)): np.random.shuffle(idx) mi, T_value, __ = sess.run([ib_loss, T, _t_train], feed_dict={train_model.wX: wobs[idx], train_model.wh: whs[idx], LR: lr, train_model.istraining: True}) logger.record_tabular('mutual_info_loss', float(mi)) logger.record_tabular('T_value', float(T_value)) logger.dump_tabular() def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_mine = train_mine self.train_model = train_model self.step_model = step_model self.get_wh = step_model.get_wh self.get_noise = step_model.get_noise self.value = step_model.wvalue self.step = step_model.step self.initial_state = step_model.w_initial_state self.save = save self.load = load self.sv_M = sv_M # self.rpf_h = rpf_h # self.rpf_matrix = rpf_matrix # self.rpf_grads = rpf_grads tf.global_variables_initializer().run(session=sess)
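# Hedged sketch (not in the original file) of the radial-basis kernel that
# self.rpf_kernel / self.h_coef above appear to implement: the pairwise kernel
# matrix over the sv_M particles and the repulsive kernel gradient of SVGD,
# with a median-heuristic bandwidth. Written in NumPy for self-containedness;
# names and shapes are assumptions inferred from the call sites.
import numpy as np

def rpf_kernel_sketch(x, h):
    # x: [nbatch, M, d] particles. Returns K: [nbatch, M, M] and the SVGD
    # repulsive term grad_K: [nbatch, M, d].
    diff = x[:, :, None, :] - x[:, None, :, :]   # [b, i, j, d] = x_i - x_j
    sq = np.sum(diff ** 2, axis=-1)              # squared pairwise distances
    K = np.exp(-sq / h)                          # RBF kernel matrix
    # sum_i grad_{x_i} k(x_i, x_j) = -(2/h) * sum_i K_ij * (x_i - x_j)
    grad_K = -2.0 / h * np.einsum('bij,bijd->bjd', K, diff)
    return K, grad_K

def h_coef_sketch(x, M):
    # Median trick for the bandwidth: h = median(squared distance) / log(M + 1).
    diff = x[:, :, None, :] - x[:, None, :, :]
    sq = np.sum(diff ** 2, axis=-1)
    return np.median(sq) / np.log(M + 1.0) + 1e-8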
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nenvs, inter_op_parallelism_threads=nenvs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) self.saver = tf.train.Saver(max_to_keep=1000) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(path, steps): make_path(path) self.saver.save(sess, path + 'model', global_step=steps) def load(path, steps): self.saver = tf.train.import_meta_graph(path + 'model' + '-' + str(steps) + '.meta') self.saver.restore(sess, tf.train.latest_checkpoint(path)) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs*nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(save_path) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs*nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Defines the step_model and train_model functions; each gets the same 'sess' print("Constructing model... STEP_MODEL & TRAIN_MODEL: constructing step_model policy | " + str(policy)) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) # train_model takes in the mini-batch produced by 5 step_model steps, NOTE: reuse = true train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) # neglogpac is the softmax cross-entropy over the training model's policy logits, i.e. the negative log-probability of the chosen actions neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) print("MAIN: neglogpac = sparse_softmax_cross_entropy_with_logits() inputs: ") print("MAIN: train_model_pi: " + str(train_model.pi)) print("MAIN: labels: " + str(A)) # policy gradient loss: the mean of advantage * neglogpac pg_loss = tf.reduce_mean(ADV * neglogpac) # value function loss is mse(tf.squeeze(train_model.vf), R) # ^ in English, mse(model value prediction, actual reward) # mse == mean squared error, defined in a2c/utils.py vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) # entropy of the policy entropy = tf.reduce_mean(cat_entropy(train_model.pi)) # total loss: the A2C objective from the A3C paper, policy gradient loss minus an entropy bonus plus the weighted value loss loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef # params gets the trainable variables of the model (the network weights) params = find_trainable_variables("model") # computes gradients of 'loss' w.r.t. 'params' # from the tflow docs: gradients() adds ops to the graph that output the derivatives w.r.t. 'params' grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) # one gradient tensor is computed per trainable variable grads = list(zip(grads, params)) # RMSProp adapts the effective per-parameter step size, check thesis notes trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) # applying the gradients over the trainable variables updates the weights _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) writer = tf.summary.FileWriter("/tmp/helloTensorBoard.txt") writer.add_graph(sess.graph) # Trains the model. # TODO: What is the 'masks' input param? # Q: How often does train_model run vs. step_model? A: once per mini-batch, which is currently 5 steps def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() # td_map hooks up all inputs for the train model td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks # policy loss, value loss, and policy entropy calculations; the _train op backpropagates the loss through the network 
policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): # NOTE: save_path is ignored; the model is always written to model.pkl in the logger directory. path = logger.get_dir() + "/model.pkl" print("Logger dir: " + logger.get_dir()) print("MODEL SAVED TO: " + str(path)) ps = sess.run(params) #make_path(osp.dirname(save_path)) joblib.dump(ps, path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
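# A tiny worked example (illustrative numbers, not original code) of the
# advantage used in train() above: advs = rewards - values, where `rewards`
# holds the runner's discounted n-step returns and `values` the critic's
# predictions for the same states.
import numpy as np
rewards = np.array([1.0, 0.5, 0.0])   # n-step returns from the runner
values = np.array([0.8, 0.6, 0.1])    # critic predictions
advs = rewards - values               # -> array([ 0.2, -0.1, -0.1])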
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', nModelsToKeep=5): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(): modelfile = os.path.join( logger.get_dir(), datetime.datetime.now().strftime("model-%Y-%m-%d-%H-%M-%S-%f")) ps = sess.run(params) joblib.dump(ps, modelfile) logger.log('Model saved to %s' % modelfile) model_files = sorted( fnmatch.filter(os.listdir(logger.get_dir()), "model-*")) if len(model_files) > nModelsToKeep: for old_file in model_files[0:-nModelsToKeep]: os.remove(os.path.join(logger.get_dir(), old_file)) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) logger.log('Model loaded from %s' % load_path) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True) logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) self.logits = logits = train_model.pi ##training loss pg_loss = tf.reduce_mean(ADV*logpac) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) pg_loss = pg_loss - ent_coef * entropy vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) train_loss = pg_loss + vf_coef * vf_loss ##Fisher loss construction self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac) sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf)) self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2)) self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss self.params=params = find_trainable_variables("model") self.grads_check = grads = tf.gradients(train_loss,params) with tf.device('/gpu:0'): self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\ momentum=0.9, kfac_update=1, epsilon=0.01,\ stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm) update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params) train_op, q_runner = optim.apply_gradients(list(zip(grads,params))) self.q_runner = q_runner self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, train_op], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state tf.global_variables_initializer().run(session=sess)
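# Hedged note (added commentary, not original code) on the Fisher losses above:
# pg_fisher_loss = -mean(logpac) yields gradients whose outer products estimate
# the policy's Fisher matrix. For the value head, vf is treated as the mean of a
# unit-variance Gaussian; regressing it onto the stopped sample
# sample_net = vf + eps, eps ~ N(0, 1), gives gradient statistics that estimate
# the value function's Fisher/Gauss-Newton block without moving its optimum.
# These are the statistics optim.compute_and_apply_stats collects for K-FAC.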
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=num_procs, inter_op_parallelism_threads=num_procs) config.gpu_options.allow_growth = True sess = tf.Session(config=config) nact = ac_space.n nbatch = nenvs * nsteps writer = tf.summary.FileWriter( "/tmp/a2c_demo/1") # Change for SAT: this is to use TensorBoard A = tf.placeholder( tf.int32, [nbatch]) # Comments by Fei: this must be the action ADV = tf.placeholder( tf.float32, [nbatch]) # Comments by Fei: this must be the advantage R = tf.placeholder( tf.float32, [nbatch]) # Comments by Fei: this must be the reward LR = tf.placeholder( tf.float32, []) # Comments by Fei: this must be the learning rate step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) # Comments by Fei: pi is nbatch * nact pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) # writer.add_graph(sess.graph) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) # create the parent directory, not a directory named like the file joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, ob_space, ac_space, nenvs, master_ts = 1, worker_ts = 30, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, cell = 256, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', algo='regular', beta=1e-3): print('Create Session') gpu_options = tf.GPUOptions(allow_growth=True) sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) nact = ac_space.n nbatch = nenvs*master_ts*worker_ts A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, 1, cell = cell, model='step_model', algo=algo) train_model = policy(sess, ob_space, ac_space, nbatch, master_ts, worker_ts, model='train_model', algo=algo) print('model_setting_done') # loss construction neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.wpi)) pg_loss = pg_loss - entropy * ent_coef print('algo: ', algo, 'max_grad_norm: ', str(max_grad_norm)) try: if algo == 'regular': loss = pg_loss + vf_coef * vf_loss elif algo == 'VIB': ''' VIB adds a third loss besides vf_loss and pg_loss: kl_loss = ds.kl_divergence(model.encoding, prior), where the prior is a Gaussian with mu=0, std=1. The final loss is pg_loss + vf_coef * vf_loss + beta * kl_loss. ''' prior = ds.Normal(0.0, 1.0) kl_loss = tf.reduce_mean(ds.kl_divergence(train_model.encoding, prior)) loss = pg_loss + vf_coef * vf_loss + beta*kl_loss else: raise ValueError('Unknown algorithm: ' + str(algo)) except Exception as e: print(e) raise # don't continue with an undefined loss grads, global_norm = grad_clip(loss, max_grad_norm, ['model']) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(wobs, whs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(whs)): cur_lr = lr.value() td_map = {train_model.wX:wobs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.wS] = states td_map[train_model.wM] = masks ''' you can add and run additional losses for VIB here for debugging, such as kl_loss ''' tloss, value_loss, policy_loss, policy_entropy, _ = sess.run( [loss, vf_loss, pg_loss, entropy, _train], feed_dict=td_map ) return tloss, value_loss, policy_loss, policy_entropy params = find_trainable_variables("model") def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.wvalue self.get_wh = step_model.get_wh self.initial_state = step_model.w_initial_state self.train = train self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
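# Hedged sanity check (not original code) for the VIB KL term above: compare
# ds.kl_divergence against the closed-form KL between N(mu, sigma) and the
# standard normal prior N(0, 1), KL = 0.5*(mu^2 + sigma^2 - 1) - log(sigma).
# Assumes `ds` is tf.distributions, as in the VIB branch; numbers are illustrative.
import tensorflow as tf
ds = tf.distributions
mu = tf.constant([0.3, -1.2])
sigma = tf.constant([0.8, 1.5])
encoding, prior = ds.Normal(mu, sigma), ds.Normal(0.0, 1.0)
kl_lib = ds.kl_divergence(encoding, prior)
kl_closed = 0.5 * (tf.square(mu) + tf.square(sigma) - 1.0) - tf.log(sigma)
with tf.Session() as s:
    print(s.run([kl_lib, kl_closed]))  # the two should agree elementwise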
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nscripts=16, nsteps=20, nstack=4, ent_coef=0.1, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.001, kfac_clip=0.001, lrschedule='linear', alpha=0.99, epsilon=1e-5): config = tf.ConfigProto( allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nsml.bind(sess=sess) #nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) XY0 = tf.placeholder(tf.int32, [nbatch]) XY1 = tf.placeholder(tf.int32, [nbatch]) # ADV == TD_TARGET - values ADV = tf.placeholder(tf.float32, [nbatch]) TD_TARGET = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy( sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) self.model2 = train_model = policy( sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) # Policy 1 : Base Action : train_model.pi label = A script_mask = tf.concat( [ tf.zeros([nscripts * nsteps, 1]), tf.ones([(nprocs - nscripts) * nsteps, 1]) ], axis=0) pi = train_model.pi pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi, labels=A) neglogpac *= tf.stop_gradient(pac_weight) inv_A = 1.0 - tf.cast(A, tf.float32) xy0_mask = tf.cast(A, tf.float32) xy1_mask = tf.cast(A, tf.float32) condition0 = tf.equal(xy0_mask, 2) xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) xy0_mask = 1.0 - xy0_mask condition1 = tf.equal(xy1_mask, 2) xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) # One-hot representation of the chosen marine. # [batch_size, 2] pi_xy0 = train_model.pi_xy0 pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 pac_weight = tf.reduce_sum( pac_weight * tf.one_hot(XY0, depth=1024), axis=1) logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi_xy0, labels=XY0) logpac_xy0 *= tf.stop_gradient(pac_weight) logpac_xy0 *= tf.cast(xy0_mask, tf.float32) pi_xy1 = train_model.pi_xy1 pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 pac_weight = tf.reduce_sum( pac_weight * tf.one_hot(XY1, depth=1024), axis=1) # fixed: was tf.one_hot(XY0, ...), a copy-paste slip; the xy1 head is weighted by its own labels 
logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi_xy1, labels=XY1) logpac_xy1 *= tf.stop_gradient(pac_weight) logpac_xy1 *= tf.cast(xy1_mask, tf.float32) pg_loss = tf.reduce_mean(ADV * neglogpac) pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) vf_ = tf.squeeze(train_model.vf) vf_r = tf.concat( [ tf.ones([nscripts * nsteps, 1]), tf.zeros([(nprocs - nscripts) * nsteps, 1]) ], axis=0) * TD_TARGET vf_masked = vf_ * script_mask + vf_r #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps] vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET)) entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) entropy = entropy_a + entropy_xy0 + entropy_xy1 loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) self.logits = logits = train_model.pi # xy0 self.params_common = params_common = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') self.params_xy0 = params_xy0 = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss self.grads_check_xy0 = grads_xy0 = tf.gradients( train_loss_xy0, params_xy0) if max_grad_norm is not None: grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) grads_xy0 = list(zip(grads_xy0, params_xy0)) trainer_xy0 = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) # xy1 self.params_xy1 = params_xy1 = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss self.grads_check_xy1 = grads_xy1 = tf.gradients( train_loss_xy1, params_xy1) if max_grad_norm is not None: grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) grads_xy1 = list(zip(grads_xy1, params_xy1)) trainer_xy1 = tf.train.RMSPropOptimizer( learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, td_targets, masks, actions, xy0, xy1, values): advs = td_targets - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = { train_model.X: obs, A: actions, XY0: xy0, XY1: xy1, ADV: advs, TD_TARGET: td_targets, PG_LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _, \ policy_loss_xy0, policy_entropy_xy0, _, \ policy_loss_xy1, policy_entropy_xy1, _ = sess.run( [pg_loss, vf_loss, entropy, _train, pg_loss_xy0, entropy_xy0, _train_xy0, pg_loss_xy1, entropy_xy1, _train_xy1], td_map) return policy_loss, value_loss, policy_entropy, \ policy_loss_xy0, policy_entropy_xy0, \ policy_loss_xy1, policy_entropy_xy1 def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = save 
self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state print("global_variables_initializer start") tf.global_variables_initializer().run(session=sess) print("global_variables_initializer complete")
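# Tiny example (illustrative, not original code) of the script_mask layout used
# above: the rows belonging to the nscripts scripted workers are zeroed and the
# remaining learned-worker rows are ones.
import numpy as np
nscripts, nprocs, nsteps = 2, 3, 2
script_mask = np.concatenate([np.zeros((nscripts * nsteps, 1)),
                              np.ones(((nprocs - nscripts) * nsteps, 1))], axis=0)
# script_mask.T -> [[0., 0., 0., 0., 1., 1.]]; shape [nprocs * nsteps, 1]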
def __init__(self, policy, p, has_state): """ policy : internal policy model such as SnakeModel.CNNPolicy p : hyperparameters required for training """ sess = tf_util.make_session() # TensorFlow model initialization step_model = policy(sess=sess, p=p, train_phase=False, has_state=has_state) # deploy model settings train_model = policy(sess=sess, p=p, train_phase=True, has_state=has_state) # training model settings saver = tf.train.Saver() # Step 2: initialize the training parameters A = tf.placeholder(tf.int32, [p.N_BATCH]) ADV = tf.placeholder(tf.float32, [p.N_BATCH]) R = tf.placeholder(tf.float32, [p.N_BATCH]) LR = tf.placeholder(tf.float32, []) # Step 3: define the loss function neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * p.ENTROPY_COEFF + vf_loss * p.VALUE_FUNC_COEFF # Step 4: define the loss optimizer params = find_trainable_variables("model") grads = tf.gradients(loss, params) if p.MAX_GRAD_NORM is not None: grads, grad_norm = tf.clip_by_global_norm( grads, p.MAX_GRAD_NORM ) # clip the gradients to protect the learned weights grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=p.RMS_DECAY, epsilon=p.EPSILON) _train = trainer.apply_gradients( grads) # this op is run to apply an update lr = Scheduler(v=p.LEARNING_RATE, nvalues=p.N_TIMESTEPS, schedule=p.LEARNING_RATE_SCHEDULE ) # learning rate changes linearly or as per arguments # Step 5: write down the summary parameters to be used writer = tf.summary.FileWriter(p.LOG_PATH) # summary writer def train(obs, rewards, masks, actions, values, states): """ obs : batch x n x m x 1 snake matrix rewards : batch x 1 rewards corresponding to the actions actions : batch x 1 discrete actions taken values : batch x 1 value function output during training states : recurrent state of the policy, if any """ advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, train_model.S: states, A: actions, ADV: advs, R: rewards, LR: cur_lr } policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): #ps = sess.run(params) #make_path(save_path) #joblib.dump(ps, save_path) saver.save(sess, save_path) def load(load_path): #loaded_params = joblib.load(load_path) #restores = [] #for p, loaded_p in zip(params, loaded_params): # restores.append(p.assign(loaded_p)) #ps = sess.run(restores) saver.restore(sess, load_path) def add_scalar_summary(tag, value, step): summary = tf.Summary( value=[tf.Summary.Value(tag=tag, simple_value=value)]) writer.add_summary(summary, step) # expose the closure functions to the user self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.hidden_value = step_model.hidden_value self.initial_state = step_model.initial_state self.add_scalar_summary = add_scalar_summary self.save = save self.load = load # initialize global variables and add the tf graph tf.global_variables_initializer().run(session=sess) writer.add_graph(tf.get_default_graph()) # write graph
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', param=None): sess = tf_util.make_session() nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False, param=param) train_model = policy(sess, ob_space, ac_space, nenvs * nsteps, nsteps, reuse=True, param=param) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=train_model.pi, labels=A) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R)) entropy = tf.reduce_mean(cat_entropy(train_model.pi)) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def save(save_path): ps = sess.run(params) make_path(osp.dirname(save_path)) # create the parent directory, not a directory named like the file joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) ps = sess.run(restores) self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = save self.load = load tf.global_variables_initializer().run(session=sess)
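# Hedged usage sketch (illustrative only; CnnPolicy, env, and runner come from
# the surrounding project and are assumptions here):
# model = Model(policy=CnnPolicy, ob_space=env.observation_space,
#               ac_space=env.action_space, nenvs=16, nsteps=5)
# obs, states, rewards, masks, actions, values = runner.run()
# policy_loss, value_loss, policy_entropy = model.train(
#     obs, states, rewards, masks, actions, values)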