def loss(self, x, t, u_hat):
    """Returns the sum of MSE terms enforcing the properties of the 1D heat equation."""
    # Partial derivatives
    u = u_hat(x, t)
    u_x = tf.gradients(u, x)[0]
    u_xx = tf.gradients(u_x, x)[0]
    u_t = tf.gradients(u, t)[0]

    # Initial and boundary conditions
    n = tf.size(x)
    zeros = tf.zeros([n, 1], dtype=tf.float64)
    ones = tf.ones([n, 1], dtype=tf.float64)
    input_values = tf.cast(tf.reshape(tf.linspace(0.0, 1.0, n), [-1, 1]), dtype=tf.float64)
    u_t0 = u_hat(input_values, zeros)  # t = 0
    u_x0 = u_hat(zeros, input_values)  # x = 0
    u_x1 = u_hat(ones, input_values)   # x = 1

    return (mean_squared_error(u_t, u_xx)
            + mean_squared_error(u_t0, tf.sin(np.pi * input_values))
            + mean_squared_error(u_x0, zeros)
            + mean_squared_error(u_x1, zeros))
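# Hypothetical sanity check (not part of the original code): the conditions the loss
# above enforces -- u_t = u_xx on (0, 1), u(x, 0) = sin(pi*x), u(0, t) = u(1, t) = 0 --
# are solved exactly by u(x, t) = exp(-pi^2 * t) * sin(pi*x). A trained u_hat could be
# compared against this closed form; here we only confirm numerically that the exact
# solution has a vanishing PDE residual, using central finite differences.
import numpy as np

def u_exact(x, t):
    return np.exp(-np.pi**2 * t) * np.sin(np.pi * x)

x = np.linspace(0.0, 1.0, 201)
t = 0.1
h = x[1] - x[0]
dt = 1e-6

u_t = (u_exact(x, t + dt) - u_exact(x, t - dt)) / (2 * dt)                   # d/dt
u_xx = (u_exact(x + h, t) - 2 * u_exact(x, t) + u_exact(x - h, t)) / h**2    # d^2/dx^2

print(np.max(np.abs(u_t - u_xx)))  # close to 0 (only finite-difference truncation error)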
def eigen_ODE_loss(x0, t, x_hat, l=n * m, seed=seed):
    """Custom loss function to be used in the HeatLearner class
    ( HeatLearner(custom_loss=eigen_ODE_loss) ).
    Uses a loop to compute f(x(t)) because of shape requirements in the f transform.
    The x(t) output has shape (n*m, 1); x(t=t_m) has shape (n, 1)."""
    np.random.seed(seed)
    A = np.random.normal(0, 1, (n, n))
    A = (A.T + A) / 2
    A = tf.convert_to_tensor(A, dtype=tf.float64)  # (n, n) symmetric matrix

    x = x_hat(x0, t)  # First forward pass/prediction

    # Loop over each (n, 1) block at each time m in the (n*m, 1) output
    Fx = []
    for i in range(int(l / n)):  # l/n = m
        x_vec = x[n * i:n * (i + 1), 0]     # x vector at the time dictated by the rows
        x_vec = tf.reshape(x_vec, [-1, 1])  # Shape (n, 1)
        # f(x(t))
        xT = tf.transpose(x_vec)
        m1 = tf.matmul(xT, x_vec) * A
        m2 = (1 - tf.matmul(xT, tf.matmul(A, x_vec)))
        fx = tf.matmul(m1 + m2, x_vec)
        Fx.append(fx)
    fx = tf.reshape(Fx, [-1, 1])  # Reshape to (n*m, 1)

    x_t = tf.gradients(x_hat(x0, t), t)[0]  # Gradient (n*m, 1)
    return mean_squared_error(x_t, fx - x)
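# Hypothetical check (not from the original code). Assuming the intended transform is the
# classic neural eigenvalue ODE f(x) = (x^T x) A x + (1 - x^T A x) x (the second term acting
# through the identity), a unit eigenvector v of the symmetric matrix A satisfies f(v) = v,
# so dx/dt = f(x) - x vanishes there -- which is what the MSE above drives the network towards.
import numpy as np

n = 6
rng = np.random.default_rng(0)
A = rng.normal(0, 1, (n, n))
A = (A.T + A) / 2                    # symmetric, as in the loss above

eigvals, eigvecs = np.linalg.eigh(A)
v = eigvecs[:, -1].reshape(-1, 1)    # unit eigenvector for the largest eigenvalue

f_v = (v.T @ v) * (A @ v) + (1 - v.T @ A @ v) * v
print(np.max(np.abs(f_v - v)))       # ~0: eigenvectors are equilibria of the ODE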
def model_fn(features, labels, mode):
    # Silence TF-Hub loading logs while the module is instantiated, then restore INFO
    tf.logging.set_verbosity(tf.logging.WARN)
    model = hub.Module(IMG_ENCODER, trainable=True)
    tf.logging.set_verbosity(tf.logging.INFO)

    model = model(features['x'])

    # Dense head on top of the encoder output (layers chained, so each feeds the next)
    output = tf.layers.dense(model, VEC_SPACE_DIMENSIONS, activation=tf.nn.relu)
    output = tf.layers.dense(output, VEC_SPACE_DIMENSIONS, activation=tf.nn.relu)
    output = tf.layers.dense(output, VEC_SPACE_DIMENSIONS, activation=tf.nn.tanh)

    if mode == ModeKeys.TRAIN or mode == ModeKeys.EVAL:
        loss = mean_squared_error(labels, output)
        regularizer = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = loss + 0.25 * sum(regularizer)

    if mode == ModeKeys.TRAIN:
        train_op = AdamOptimizer(learning_rate=0.00001).minimize(
            loss=loss, global_step=get_global_step())
        return EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    elif mode == ModeKeys.EVAL:
        eval_metric_ops = {
            'accuracy': tf.metrics.mean_cosine_distance(labels, output, 0)
        }
        return EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
    elif mode == ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=output)
train_ds = Dataset.from_tensor_slices(
    (x_train, y_train)).shuffle(n_train_samples).batch(batch_size).repeat()
test_ds = Dataset.from_tensor_slices((x_test, y_test))

for b, item in enumerate(train_ds):
    print(b, item)
    if b + 1 == math.ceil(n_train_samples / batch_size):
        break

def create_model():
    model = Sequential()
    model.add(Dense(4, activation='tanh', input_dim=1))
    model.add(Dense(1))
    return model

eager_model = create_model()
optimizer = GradientDescentOptimizer(0.1)

for e in range(n_epochs):
    for b, (x, y) in enumerate(train_ds):
        with tf.GradientTape() as tape:
            # Call the model directly; model.predict() returns NumPy arrays and
            # would break the gradient tape.
            pred = eager_model(x)
            loss_value = mean_squared_error(pred, tf.reshape(y, (-1, 1)))
        grads = tape.gradient(loss_value, eager_model.variables)
        optimizer.apply_gradients(zip(grads, eager_model.variables))
        print(loss_value)
        # .repeat() makes the dataset infinite, so stop after one pass per epoch
        if b + 1 == math.ceil(n_train_samples / batch_size):
            break
    print(e)
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', network='cnn', prio_args=None): self.prio_args = prio_args sess = tf_util.get_session() nenvs = self.get_active_envs(env) nbatch = nenvs * nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) # our TD evaluating network A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) # TD loss # td_loss = losses.mean_squared_error(tf.squeeze(train_model.dt), TD) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef """prio model""" with tf.variable_scope('a2c_model_prio', reuse=tf.AUTO_REUSE): # prio_model = policy(nbatch, nsteps, sess) prio_model = MyNN(env, nbatch, network) P_R = tf.placeholder(tf.float32, [nbatch]) PRIO = tf.placeholder(tf.float32, [nbatch]) P_LR = tf.placeholder(tf.float32, []) # prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out), P_R) # Reward prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out), PRIO) # TD Error # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") params_prio = find_trainable_variables("a2c_model_prio") # 2. Calculate the gradients grads = tf.gradients(loss, params) prio_grads = tf.gradients(prio_model_loss, params_prio) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) prio_grads, prio_grad_norm = tf.clip_by_global_norm( prio_grads, max_grad_norm) grads = list(zip(grads, params)) prio_grads = list(zip(prio_grads, params_prio)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. 
Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) prio_trainer = tf.train.RMSPropOptimizer(learning_rate=P_LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) _prio_train = prio_trainer.apply_gradients(prio_grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) prio_loss = 0 if self.prio_args is not None: prio_values = GetValuesForPrio(self.prio_args['prio_type'], self.prio_args['prio_param'], advs, rewards) prio_td_map = { prio_model.X: obs, P_R: rewards, P_LR: cur_lr, PRIO: prio_values } prio_loss, _, p_td = sess.run( [prio_model_loss, _prio_train, PRIO], prio_td_map) # mb aranged as 1D-vector = [[env_1: n1, ..., n_nstep],...,[env_n_active]] # need to take last value of each env's buffer self.prio_score = prio_values[list( filter(lambda x: x % nsteps == (nsteps - 1), range(len(prio_values))))] return policy_loss, value_loss, policy_entropy, prio_loss self.train = train self.train_model = train_model self.step_model = step_model self.prio_model = prio_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, r0_coef=0.05, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', head=-1, step_placeholder=None, train_placeholder=None, encoded_x_1=None, encoded_x_2=None): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs * nsteps self.step_placeholder = step_placeholder self.train_placeholder = train_placeholder self.encoded_x_1 = encoded_x_1 self.encoded_x_2 = encoded_x_2 with tf.variable_scope('aux_model' + str(head), reuse=tf.AUTO_REUSE): step_model, self.step_placeholder, self.encoded_x_1 = policy( nenvs, 1, sess, observ_placeholder=self.step_placeholder, encoded_x=self.encoded_x_1) train_model, self.train_placeholder, self.encoded_x_2 = policy( nbatch, nsteps, sess, observ_placeholder=self.train_placeholder, encoded_x=self.encoded_x_2) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) pg_loss = tf.reduce_mean(ADV * neglogpac) print(train_model.vf) print(R) vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) #r0_loss = losses.mean_squared_error(tf.squeeze(train_model.r), R) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables('aux_model' + str(head)) grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) #print("gradiants to update: ", grads) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) with tf.name_scope('summaries'): a_r = tf.summary.scalar('avg_reward', tf.reduce_mean(R)) #a_p_l = tf.summary.scalar('avg_pg_loss', tf.reduce_mean(pg_loss)) #a_v_l = tf.summary.scalar('avg_vf_loss', tf.reduce_mean(vf_loss)) #a_l = tf.summary.scalar('avg_loss', tf.reduce_mean(loss)) #merged = tf.summary.merge([a_r, a_p_l, a_v_l, a_l]) merged = tf.summary.merge([a_r]) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train( obs, states, rewards, masks, actions, values ): #Policy already sampled!! We need to update the critic now. advs = rewards - values #For a set of (s, a), we get (r0 - v0, r1 - v1, ...) #print("advs: ", advs) for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } #print(td_map) #we train out model with observed actions, advs, rewards, cur_lr #how is values and rewards calculated though? These cannot be sampled from a single state. if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, summary, _ = sess.run( [pg_loss, vf_loss, entropy, merged, _train], td_map) return policy_loss, value_loss, policy_entropy, summary self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) self.train_writer = tf.summary.FileWriter('logs/aux/' + str(head), sess.graph) tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, env, nsteps, icm,idf, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs*nsteps self.idf=idf print("This is Icm in Model Init function " , type(icm)) with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) if icm is not None : grads = grads + icm.pred_grads_and_vars # print("Gradients added ") # print("independetly there shape were a2c : {} icm :{} and together {} ".format(np.shape(grads),np.shape(icm.pred_grads_and_vars), # np.shape(grads_and_vars))) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values , next_obs ) : #, icm_rewards,cumulative_dicounted_icm): #, new_rew): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') # print(" icm called in train function ", type(icm)) advs = rewards - values # print("Now the advantage ", advs ) # icm_adv = icm_rewards - values # m , s = get_mean_and_std(icm_adv) # > adv Normaliztion # m , s = get_mean_and_std(advs) # advs = (advs - m) / (s + 1e-7) # advs = (icm_adv - m) / (s + 1e-7) # icm_adv = (icm_adv - icm_adv.mean()) / ( + 1e-7) # print("icm advantage ", icm_adv) # advs = new_rew - values # print("Advantage :", advs) # print("On train shapes are ") # print(" obs {} states {} rewards {} masks {} actions {} values {} ". 
# format(np.shape(obs) , np.shape(states) , np.shape(rewards) , np.shape(masks) ,np.shape(actions) , # np.shape(values) )) # print("Received Advantage {} rewards {} values {}".format( # advs , rewards , values) ) # print("advantage reward and values shape ") # print("advs {} , rewards shape {} , values {}".format(np.shape(advs) , np.shape(rewards) , np.shape(values))) for step in range(len(obs)): cur_lr = lr.value() if icm is None : td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} else : # print("curiosity Td Map ") # print(" obs {} , next obs {} , actions {} ".format(np.shape(obs) , np.shape(next_obs), # np.shape(actions))) td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr , icm.state_:obs, icm.next_state_ : next_obs , icm.action_ : actions }# , icm.R :rewards } if icm is None : if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy else : if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks if self.idf : policy_loss, value_loss, policy_entropy,forward_loss , inverse_loss , icm_loss, _ = sess.run( [pg_loss, vf_loss, entropy, icm.forw_loss , icm.inv_loss, icm.icm_loss ,_train], td_map) return policy_loss, value_loss, policy_entropy,forward_loss , inverse_loss , icm_loss, advs else : policy_loss, value_loss, policy_entropy,forward_loss , icm_loss, _ = sess.run( [pg_loss, vf_loss, entropy, icm.forw_loss , icm.icm_loss ,_train], td_map) return policy_loss, value_loss, policy_entropy,forward_loss , 0.0 , icm_loss, advs self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
# +
from tensorflow.contrib.layers import xavier_initializer
from tensorflow.losses import mean_squared_error
from tensorflow.train import AdamOptimizer

tf.reset_default_graph()

X_data = tf.placeholder(tf.float32, shape=[None, x_vals.shape[1]])
y_target = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.get_variable(shape=[x_vals.shape[1], 1], name="W", initializer=xavier_initializer())
b = tf.get_variable(shape=[1, 1], name="b", initializer=xavier_initializer())

output = tf.matmul(X_data, W) - b
l2_norm = mean_squared_error(output, y_target)
# -

# $$ Loss = \max(0, 1 - \hat{y}(i) \cdot y(i)) + \alpha ||X \cdot W - b||^2 $$

loss = tf.reduce_mean(tf.maximum(0., 1. - output * y_target)) + 0.01 * l2_norm
optimizer = AdamOptimizer(0.01).minimize(loss)

# +
batch_size = 1024

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(20000):
        rand_index = np.random.choice(len(X_train), size=batch_size)
        rand_x = X_train[rand_index]
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5,
             max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5,
             total_timesteps=int(80e6), lrschedule='linear'):

    sess = tf_util.get_session()
    nenvs = env.num_envs
    nbatch = nenvs * nsteps

    with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
        step_model = policy(nenvs, 1, sess)
        train_model = policy(nbatch, nsteps, sess)

    A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(A)
    entropy = tf.reduce_mean(train_model.pd.entropy())
    pg_loss = tf.reduce_mean(ADV * neglogpac)
    vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("a2c_model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))

    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()

        td_map = {
            train_model.X: obs,
            A: actions,
            ADV: advs,
            R: rewards,
            LR: cur_lr
        }
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = functools.partial(tf_util.save_variables, sess=sess)
    self.load = functools.partial(tf_util.load_variables, sess=sess)
    tf.global_variables_initializer().run(session=sess)
def _mse(y_true: Tensor, y_pred: Tensor) -> Tensor:
    """Mean squared error loss (equivalent to losses.mse()).

    :param y_true: shape = (N, In), float32
    :param y_pred: shape = (N, In), float32
    :return: shape = (N,), float32"""
    return tf.reduce_mean((y_true - y_pred) ** 2, axis=-1)

tf.random.set_seed(0)
y_true = tf.random.normal((16, 10))
y_pred = tf.random.normal((16, 10))
print(losses.mse(y_true, y_pred))
print(losses.mean_squared_error(y_true, y_pred))  # same result as above
print(_mse(y_true, y_pred))
# tf.Tensor(
# [1.1952267 3.4243941 2.1024227 2.3010921 2.3643446 1.8302895 1.6360563
#  3.5714912 2.3740485 4.2296114 1.4224513 4.019039  0.7188259 1.5340036
#  1.5875269 2.435854 ], shape=(16,), dtype=float32)
# tf.Tensor(
# [1.1952267 3.4243941 2.1024227 2.3010921 2.3643446 1.8302895 1.6360563
#  3.5714912 2.3740485 4.2296114 1.4224513 4.019039  0.7188259 1.5340036
#  1.5875269 2.435854 ], shape=(16,), dtype=float32)
# tf.Tensor(
# [1.1952267 3.4243941 2.1024227 2.301092  2.3643446 1.8302895 1.6360562
#  3.5714912 2.3740482 4.2296114 1.4224513 4.019039  0.7188258 1.5340036
#  1.587527  2.435854 ], shape=(16,), dtype=float32)
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, diverse_r_coef=0.1, gamma=0.99, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs * nsteps with tf.variable_scope('vfo_model', reuse=tf.AUTO_REUSE): step_model = policy(nbatch=nenvs, nsteps=1, sess=sess) train_model = policy(nbatch=nbatch, nsteps=nsteps, sess=sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) params = find_trainable_variables('vfo_model') print(params) # ============================== # model-free actor-critic loss # ============================== with tf.variable_scope('mf_loss'): neglogpac = train_model.pd.neglogp(A) entropy = tf.reduce_mean(train_model.pd.entropy()) pg_loss = tf.reduce_mean(ADV * neglogpac) vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) # ============================== # diverse options policy loss # ============================== option_train_ops = [] option_losses = [] option_losses_names = [] option_distil_train_op = None with tf.variable_scope('options_loss'): diversity_reward = -1 * tf.nn.softmax_cross_entropy_with_logits_v2( labels=train_model.op_z, logits=train_model.option_discriminator) diversity_reward = tf.check_numerics( diversity_reward, 'Check numerics (1): diversity_reward') diversity_reward -= tf.log( tf.reduce_sum(train_model.prior_op_z * train_model.op_z) + 1e-6) print('d_reward:', diversity_reward.get_shape().as_list()) intrinsic_reward = tf.multiply( train_model.next_pvfs - train_model.pvfs, train_model.op_z) intrinsic_reward = tf.reduce_sum(intrinsic_reward, 1) print('i_reward:', intrinsic_reward.get_shape().as_list()) reward = diverse_r_coef * diversity_reward + intrinsic_reward with tf.variable_scope('critic'): next_vf = tf.reduce_sum( tf.multiply(train_model.next_pvfs, train_model.op_z), 1) print('next_vf:', next_vf.get_shape().as_list()) option_q_y = tf.stop_gradient(reward + (1 - train_model.dones) * gamma * next_vf) option_q = tf.squeeze(train_model.option_q, 1) print('option_q_y:', option_q_y.get_shape().as_list()) print('option_q:', option_q.get_shape().as_list()) option_q_loss = 0.5 * tf.reduce_mean( (option_q_y - option_q)**2) with tf.variable_scope('actor'): log_op_pi_t = train_model.option_pd.logp(A) log_target_t = tf.squeeze(train_model.option_q, 1) pvf = tf.reduce_sum( tf.multiply(train_model.pvfs, train_model.op_z), 1) print('op_pi:', log_op_pi_t.get_shape().as_list()) print('op_t:', log_target_t.get_shape().as_list()) print('pvf:', pvf.get_shape().as_list()) kl_surrogate_loss = tf.reduce_mean( log_op_pi_t * tf.stop_gradient(log_op_pi_t - log_target_t - pvf)) with tf.variable_scope('discriminator'): print('op_z:', train_model.op_z.get_shape().as_list()) print('op_dis:', train_model.option_discriminator.get_shape().as_list()) discriminator_loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits_v2( labels=train_model.op_z, logits=train_model.option_discriminator_logits)) with tf.variable_scope('distillation'): # NOTE: to train 
distillation, op_z should be feed with q(z|s) print('mf_pi:', train_model.pi.get_shape().as_list()) print('op_pi:', train_model.option_pi.get_shape().as_list()) distillation_loss = losses.mean_squared_error( tf.stop_gradient(train_model.pi), train_model.option_pi) _train_option_q = tf.train.AdamOptimizer(lr).minimize( loss=option_q_loss, var_list=params) option_train_ops.append(_train_option_q) option_losses.append(option_q_loss) option_losses_names.append('option_critic') _train_option_policy = tf.train.AdamOptimizer(lr).minimize( loss=kl_surrogate_loss, var_list=params) option_train_ops.append(_train_option_policy) option_losses.append(kl_surrogate_loss) option_losses_names.append('option_actor') _train_option_disc = tf.train.AdamOptimizer(lr).minimize( loss=discriminator_loss, var_list=params) option_train_ops.append(_train_option_disc) option_losses.append(discriminator_loss) option_losses_names.append('option_discriminator') option_distil_train_op = tf.train.AdamOptimizer(lr).minimize( loss=distillation_loss, var_list=params) tf.summary.FileWriter(logger.get_dir(), sess.graph) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy def train_options(obs, next_obs, states, next_states, masks, next_masks, actions, actions_full, dones, options_z): feed = { train_model.X: obs, train_model.X_next: next_obs, A: actions, train_model.ac: actions_full, train_model.dones: dones, train_model.op_z: options_z } if states is not None: feed[train_model.S] = states feed[train_model.next_S] = next_states feed[train_model.M] = masks feed[train_model.next_M] = next_masks record_loss_values = [] for name, loss, train_op in zip(option_losses_names, option_losses, option_train_ops): loss_value, _ = sess.run([loss, train_op], feed) record_loss_values.append((name + '_loss', loss_value)) return record_loss_values def distill_mf_to_options(obs, states, masks): feed = {train_model.X: obs} if states is not None: feed[train_model.S] = states feed[train_model.M] = masks option_ensembles = sess.run(train_model.option_discriminator, feed) feed[train_model.op_z] = option_ensembles distillation_loss_value, _ = sess.run( [distillation_loss, option_distil_train_op], feed) return distillation_loss_value self.train = train self.train_options = train_options self.distill_mf_to_options = distill_mf_to_options self.train_model = train_model self.prior_op_z = train_model.prior_op_z self.step_model = step_model self.step = step_model.step self.option_step = step_model.option_step self.option_select = step_model.option_select self.selective_option_step = step_model.selective_option_step self.value = step_model.value self.proto_value = step_model.proto_value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
def __init__( self, ob_size, # dimension of observation vector act_size, # dimension of action vector latents, # network hidden layer sizes learning_rate=1e-5, # learning rate activation='relu', # activation function optimizer='adam', # optimization function vf_coef=0.1, # vf_loss weight ent_coef=0.01, # ent_loss weight max_grad_norm=0.5): # how frequently the logs are printed out sess = tf_util.get_session() activation = tf_util.get_activation(activation) optimizer = tf_util.get_optimizer(optimizer) # learning_rate = tf.train.polynomial_decay( # learning_rate=learning_rate, # global_step=tf.train.get_or_create_global_step(), # decay_steps=total_epoches, # end_learning_rate=learning_rate / 10, # ) # placeholders for use X = tf.placeholder(tf.float32, [None, None, ob_size], 'observation') A = tf.placeholder(tf.int32, [None], 'action') ADV = tf.placeholder(tf.float32, [None], 'advantage') R = tf.placeholder(tf.float32, [None], 'reward') with tf.variable_scope('a2c'): policy = build_policy( observations=X, act_size=act_size, latents=latents, vf_latents=latents, activation=activation ) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = policy.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(policy.entropy()) # Value loss vf_loss = losses.mean_squared_error(R, policy.vf) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # gradients and optimizer params = tf.trainable_variables('a2c') grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # 3. Make op for one policy and value update step of A2C trainer = optimizer(learning_rate=learning_rate) _train = trainer.apply_gradients(grads) # Add ops to save and restore all the variables. saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='a2c')) def step(obs): action, value = sess.run([policy.action, policy.vf], feed_dict={ X: obs }) return action, value def value(obs): return sess.run(policy.vf, feed_dict={ X: obs }) def debug_output(obs): """ This function is only for debug """ return sess.run([policy.logits, policy.latent, policy.vf_latent], feed_dict={ X: obs }) def train(obs, actions, rewards, values): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values td_map = {X:obs, A:actions, ADV:advs, R:rewards} policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy def save(save_path): saver.save(sess, save_path) print(f'Model saved to {save_path}') def load(load_path): saver.restore(sess, load_path) print(f'Model restored from {load_path}') self.train = train self.step = step self.value = value self.save = save self.load = load # for debug self.debug_output = debug_output tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, env, nsteps,
             ent_coef=0.01,      # entropy coefficient
             vf_coef=0.5,        # value coefficient
             max_grad_norm=0.5, lr=7e-4,
             alpha=0.99,         # RMSProp decay
             epsilon=1e-5,
             total_timesteps=int(80e6), lrschedule='linear'):

    sess = tf_util.get_session()
    nenvs = env.num_envs
    nbatch = nenvs * nsteps

    with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
        # step_model is used for sampling
        step_model = policy(nenvs, 1, sess)
        # train_model is used to train the network
        train_model = policy(nbatch, nsteps, sess)

    A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    # Calculate the loss
    # Total loss = policy gradient loss - entropy coefficient * entropy + value coefficient * value loss

    # Policy loss
    neglogpac = train_model.pd.neglogp(A)
    # L = A(s,a) * -logpi(a|s)
    pg_loss = tf.reduce_mean(ADV * neglogpac)

    # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
    entropy = tf.reduce_mean(train_model.pd.entropy())

    # Value loss
    vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    # Update the parameters (weights and biases) using the loss
    # 1. Get the model parameters
    params = find_trainable_variables("a2c_model")

    # 2. Calculate the gradients
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        # Clip the gradients (normalize)
        grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    # zip aggregates each gradient with its associated parameter.
    # For instance zip(ABCD, xyza) => Ax, By, Cz, Da
    grads = list(zip(grads, params))

    # 3. Make the op for one policy and value update step of A2C
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions, values):
        # Here we calculate the advantage: A(s,a) = R + yV(s') - V(s)
        # rewards = R + yV(s')
        advs = rewards - values
        for step in range(len(obs)):
            cur_lr = lr.value()

        td_map = {
            train_model.X: obs,
            A: actions,
            ADV: advs,
            R: rewards,
            LR: cur_lr
        }
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    self.save = functools.partial(tf_util.save_variables, sess=sess)
    self.load = functools.partial(tf_util.load_variables, sess=sess)
    tf.global_variables_initializer().run(session=sess)
def _build_net(self, s, h, scope, trainable, ent_coef=0.01, vf_coef=0.5): # s is the state of the current market # h is the number of hand 0-11 with tf.variable_scope(scope): init_w = tf.random_normal_initializer(0., 0.3) init_b = tf.constant_initializer(0.1) net = tf.layers.dense(s, 30, activation=tf.nn.relu, kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable) with tf.variable_scope('tcn'): tcndropout = tf.placeholder_with_default(0., shape=()) value_map = build_tcn(s, tcndropout, kernel_size=3, num_channels=[256, 64, 32, 10]) if (modelDebug): print("value_map shape", value_map.shape) #value_map shape (?, 20, 10) with tf.variable_scope('vin'): v = value_map[:, -1, tf. newaxis, :] # get the values of the last time step vi_w = tf.get_variable('vi_w', [3, 1, 3], initializer=init_w, trainable=trainable) for i in range(-2, -5, -1): q = tf.pad(v, tf.constant([[0, 0], [0, 0], [1, 1]])) q = tfnn.conv1d(q, vi_w, 1, "VALID", data_format="NCW") #v: [?,1,1,12] vi_w:[1,3,1,3] if (modelDebug): print("q shape", q.shape) # q shape (?, 3, 10) v = tf.reduce_max(q, axis=1, keepdims=True, name="v%d" % i) v = v + value_map[:, i, tf.newaxis, :] # print(v.shape) with tf.variable_scope('a'): v = v[:, 0, :] # reshape v into rank2 paddings = tf.constant([[0, 0], [3, 3]]) v = tf.pad(v, paddings, "SYMMETRIC") h_pos = tf.one_hot(h, depth=10) # att_v = v[:,0,h:h+7]# the attentioned value function att_v = tf.concat([v, h_pos], 1) # concat the onehot position if (modelDebug): print("att_v", att_v.shape) #att_v (?, 26) action = tf.layers.dense(att_v, self.a_dim, kernel_initializer=init_w, bias_initializer=init_b, name='a', trainable=trainable) action = tf.nn.softmax(action) #action (?, 3) if (modelDebug): print("action", action.shape) value = tf.layers.dense(att_v, 1, kernel_initializer=init_w, bias_initializer=init_b, name="v", trainable=trainable) if (modelDebug): print("value :", value.shape) a = tf.argmax( action, axis=1 ) # the optimal action selected by algorithm for inference if (modelDebug): print("a:", a.shape) a_hot = tf.one_hot( A, depth=3 ) # the one_hot vector from A(place holder of explored action) for training prob = tf.reduce_sum(tf.multiply(action, a_hot), reduction_indices=[1]) eligibility = tf.log(prob) * (R - value) loss = -tf.reduce_sum(eligibility) entropy = tf.reduce_mean(tf.multiply( tf.log(action), action)) # the entropy term promotes exploration if (modelDebug): print(" tf.multiply( tf.log(action), action )", tf.multiply(tf.log(action), action).shape) print("entropy", entropy.shape) loss += entropy * ent_coef vf_loss = losses.mean_squared_error(value, R) loss -= vf_loss * vf_coef optimizer = tf.train.AdamOptimizer(0.01).minimize(loss) return a, optimizer, value
def __init__(self, policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=32, nscripts=16, nsteps=20, nstack=4, ent_coef=0.1, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.001, kfac_clip=0.001, lrschedule='linear', alpha=0.99, epsilon=1e-5): config = tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=nprocs, inter_op_parallelism_threads=nprocs) config.gpu_options.allow_growth = True self.sess = sess = tf.Session(config=config) nsml.bind(sess=sess) #nact = ac_space.n nbatch = nenvs * nsteps A = tf.placeholder(tf.int32, [nbatch]) XY0 = tf.placeholder(tf.int32, [nbatch]) XY1 = tf.placeholder(tf.int32, [nbatch]) # ADV == TD_TARGET - values ADV = tf.placeholder(tf.float32, [nbatch]) TD_TARGET = tf.placeholder(tf.float32, [nbatch]) PG_LR = tf.placeholder(tf.float32, []) VF_LR = tf.placeholder(tf.float32, []) self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False) self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True) # Policy 1 : Base Action : train_model.pi label = A script_mask = tf.concat([ tf.zeros([nscripts * nsteps, 1]), tf.ones([(nprocs - nscripts) * nsteps, 1]) ], axis=0) pi = train_model.pi pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0 pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1) neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi, labels=A) neglogpac *= tf.stop_gradient(pac_weight) inv_A = 1.0 - tf.cast(A, tf.float32) xy0_mask = tf.cast(A, tf.float32) xy1_mask = tf.cast(A, tf.float32) condition0 = tf.equal(xy0_mask, 2) xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask) xy0_mask = 1.0 - xy0_mask condition1 = tf.equal(xy1_mask, 2) xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask) # One hot representation of chosen marine. # [batch_size, 2] pi_xy0 = train_model.pi_xy0 pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0 pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024), axis=1) logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi_xy0, labels=XY0) logpac_xy0 *= tf.stop_gradient(pac_weight) logpac_xy0 *= tf.cast(xy0_mask, tf.float32) pi_xy1 = train_model.pi_xy1 pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0 pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024), axis=1) # 1D? 2D? 
logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=pi_xy1, labels=XY1) logpac_xy1 *= tf.stop_gradient(pac_weight) logpac_xy1 *= tf.cast(xy1_mask, tf.float32) pg_loss = tf.reduce_mean(ADV * neglogpac) pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0) pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1) vf_ = tf.squeeze(train_model.vf) vf_r = tf.concat([ tf.ones([nscripts * nsteps]), tf.zeros([(nprocs - nscripts) * nsteps]) ], axis=0) * TD_TARGET vf_masked = vf_ * tf.squeeze(script_mask) + vf_r #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps] vf_loss = losses.mean_squared_error(vf_masked, TD_TARGET) entropy_a = tf.reduce_mean(cat_entropy(train_model.pi)) entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0)) entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1)) entropy = entropy_a + entropy_xy0 + entropy_xy1 loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef params = find_trainable_variables("model") grads = tf.gradients(loss, params) if max_grad_norm is not None: grads, _ = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) trainer = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) self.logits = logits = train_model.pi # xy0 self.params_common = params_common = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common') self.params_xy0 = params_xy0 = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss self.grads_check_xy0 = grads_xy0 = tf.gradients( train_loss_xy0, params_xy0) if max_grad_norm is not None: grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm) grads_xy0 = list(zip(grads_xy0, params_xy0)) trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy0 = trainer_xy0.apply_gradients(grads_xy0) # xy1 self.params_xy1 = params_xy1 = tf.get_collection( tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss self.grads_check_xy1 = grads_xy1 = tf.gradients( train_loss_xy1, params_xy1) if max_grad_norm is not None: grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm) grads_xy1 = list(zip(grads_xy1, params_xy1)) trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr, decay=alpha, epsilon=epsilon) _train_xy1 = trainer_xy1.apply_gradients(grads_xy1) self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, td_targets, masks, actions, xy0, xy1, values): advs = td_targets - values for step in range(len(obs)): cur_lr = self.lr.value() td_map = { train_model.X: obs, A: actions, XY0: xy0, XY1: xy1, ADV: advs, TD_TARGET: td_targets, PG_LR: cur_lr } if states != []: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _, \ policy_loss_xy0, policy_entropy_xy0, _, \ policy_loss_xy1, policy_entropy_xy1, _ = sess.run( [pg_loss, vf_loss, entropy, _train, pg_loss_xy0, entropy_xy0, _train_xy0, pg_loss_xy1, entropy_xy1, _train_xy1], td_map) return policy_loss, value_loss, policy_entropy, \ policy_loss_xy0, policy_entropy_xy0, \ policy_loss_xy1, policy_entropy_xy1 def save(save_path): ps = sess.run(params) joblib.dump(ps, save_path) def load(load_path): loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) sess.run(restores) self.train = train self.save = 
save self.load = load self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state print("global_variables_initializer start") tf.global_variables_initializer().run(session=sess) print("global_variables_initializer complete")
def initialize(self) -> None:
    keras.backend.set_session(self._session)

    ##############################################################################
    #                                 Q-network                                  #
    ##############################################################################
    self._sensor_input_tensor = tf.placeholder(
        dtype=tf.float32,
        shape=(None, *self._sensor_state_shape),
        name="sensor_input",
    )
    self._heat_input_tensor = tf.placeholder(
        dtype=tf.float32,
        shape=(None, *self._heat_state_shape),
        name="heat_input")
    # self._position_input_tensor = tf.placeholder(
    #     dtype=tf.float32,
    #     shape=(None, *self._position_state_shape),
    #     name="position_input"
    # )

    # conv1 = Conv2D(
    #     filters=self._n_sensors // 2,
    #     kernel_size=(1, self._window_size),
    #     data_format="channels_last",
    #     activation="relu",
    #     name="conv1",
    # )
    # conv_output = conv1(self._sensor_input_tensor)
    #
    # conv2 = Conv2D(
    #     filters=self._n_sensors // 4,
    #     kernel_size=(1, self._window_size),
    #     data_format="channels_last",
    #     activation="relu",
    #     name="conv2",
    # )
    # conv_output = conv2(conv_output)
    #
    # flatten_conv = Flatten(name="flatten_conv")
    # flattened_conv = flatten_conv(conv_output)

    concat = Concatenate(name="concat")
    concatenated_input = concat([
        self._sensor_input_tensor,
        self._heat_input_tensor,
        # self._position_input_tensor,
    ])

    hidden_dense0 = Dense(units=self._n_sensor_inputs * 8,
                          activation="relu",
                          name="hidden_dense0")  # renamed from the duplicate "hidden_dense1"
    x = hidden_dense0(concatenated_input)
    hidden_dense1 = Dense(units=self._n_sensor_inputs * 4,
                          activation="relu",
                          name="hidden_dense1")
    x = hidden_dense1(x)
    hidden_dense2 = Dense(units=self._n_sensor_inputs * 2,
                          activation="relu",
                          name="hidden_dense2")
    x = hidden_dense2(x)
    hidden_dense3 = Dense(units=self._n_sensor_inputs,
                          activation="relu",
                          name="hidden_dense3")
    x = hidden_dense3(x)
    output_layer = Dense(units=self._n_output_angles + 1, name="action_quality")
    self._actions_qualities_tensor = output_layer(x)
    self._action_index_tensor = tf.argmax(self._actions_qualities_tensor,
                                          axis=1,
                                          name="output")

    ################################################################################
    #                             Updating Q-network                               #
    ################################################################################
    self._chosen_actions_tensor = tf.placeholder(dtype=tf.int32,
                                                 shape=(None, ),
                                                 name="chosen_actions")
    self._rewards_tensor = tf.placeholder(dtype=tf.float32,
                                          shape=(None, ),
                                          name="discounted_rewards")
    self._terminates_tensor = tf.placeholder(dtype=tf.float32,
                                             shape=(None, ),
                                             name="episode_terminated")
    self._replay_next_states_qualities_tensor = tf.placeholder(
        dtype=tf.float32,
        shape=self._actions_qualities_tensor.shape,
        name="replay_next_states_qualities")

    next_state_indices = tf.stack(
        (tf.range(0, tf.shape(self._rewards_tensor)[0]), self._chosen_actions_tensor),
        axis=1)
    responsible_qualities = tf.gather_nd(
        self._replay_next_states_qualities_tensor, next_state_indices)
    # noinspection PyTypeChecker
    target_quality = (self._rewards_tensor +
                      self._terminates_tensor * responsible_qualities *
                      self._process_config.reward_discount_coef)

    tf_range = tf.range(0, tf.shape(self._rewards_tensor)[0], dtype=tf.int32)
    state_indices = tf.stack((tf_range, self._chosen_actions_tensor), axis=1)
    current_quality = tf.gather_nd(self._actions_qualities_tensor, state_indices)

    loss = losses.mean_squared_error(target_quality,
                                     current_quality,
                                     reduction=losses.Reduction.MEAN)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
    self._update_model = optimizer.minimize(loss)

    self._session.run(tf.global_variables_initializer())
    self._saver = tf.train.Saver()
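# Hypothetical numeric illustration (not from the original code) of the TD-target arithmetic
# built above: target = reward + terminate_mask * next_state_quality * discount, where the
# terminate mask is presumably 0 for terminal transitions so no bootstrapping occurs there.
import numpy as np

rewards = np.array([1.0, 0.0, -1.0])
terminate_mask = np.array([1.0, 1.0, 0.0])       # last transition ends the episode
next_state_quality = np.array([2.0, 0.5, 3.0])   # replayed quality at the chosen action
discount = 0.99                                  # stands in for reward_discount_coef

target_quality = rewards + terminate_mask * next_state_quality * discount
print(target_quality)  # [2.98, 0.495, -1.0]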
def __init__(self, optimiser, policy, env, nsteps, vf_coef=0.5, max_grad_norm=0.5, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6)): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs * nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) Ent_Coeff = tf.placeholder(tf.float32, []) # for Entropy # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy * Ent_Coeff + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Make op for one policy and value update step of A2C if optimiser == 'RMSProp': trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) elif optimiser == 'SGD': trainer = tf.train.GradientDescentOptimizer(learning_rate=LR) _train = trainer.apply_gradients(grads) #https://stackoverflow.com/a/45624533 _slot_vars = [ trainer.get_slot(var, name) for name in trainer.get_slot_names() for var in params ] SLOTS = [tf.placeholder(tf.float32, slot.shape) for slot in _slot_vars] _set_slots = [var.assign(SLOTS[i]) for i, var in enumerate(_slot_vars)] def get_opt_state(): return sess.run(_slot_vars) def set_opt_state(state): feed = {k: v for k, v in zip(SLOTS, state)} return sess.run(_set_slots, feed) def train(obs, states, rewards, masks, actions, values, ent_coeff): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, Ent_Coeff: ent_coeff, LR: 1.0 } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map) return policy_loss, value_loss, policy_entropy # Only this bit added def get_mean_std_neg_ll(obs, actions): td_map = {train_model.X: obs, A: actions} vals = sess.run( [train_model.pd.mean, train_model.pd.std, neglogpac], td_map) return vals self.get_mean_std_neg_ll = get_mean_std_neg_ll self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) self.get_opt_state = get_opt_state self.set_opt_state = set_opt_state tf.global_variables_initializer().run(session=sess)
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5,
             lr=1e-3, alpha=0.99, epsilon=1e-5):

    sess = tf.Session()
    nenvs = env.num_envs      # needed below; derived from env as in the other A2C variants
    nbatch = nenvs * nsteps

    with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
        # step_model for sampling
        step_model = policy(nenvs, 1, sess)
        # train_model to train our network
        train_model = policy(nbatch, nsteps, sess)

    A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    LR = tf.placeholder(tf.float32, [])

    # total_loss = Policy gradient loss - entropy * entropy coeff + value coeff * value loss

    # policy loss
    neglogpac = train_model.pd.neglogp(A)
    # L = -log(pi(a|s)) * Adv(s, a)
    pg_loss = tf.reduce_mean(ADV * neglogpac)

    # entropy is used to improve exploration by limiting the premature
    # convergence to a suboptimal policy
    entropy = tf.reduce_mean(train_model.pd.entropy())

    # value loss
    vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    # update params using loss
    # 1. get model params
    params = find_trainable_variables("a2c_model")

    # 2. calculate the gradients
    grads = tf.gradients(loss, params)
    grads = list(zip(grads, params))

    # 3. make op for one policy and value update step of A2C
    trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    def train(obs, states, rewards, masks, actions, values):
        # we calculate advantage A(s, a) = R + yV(s') - V(s)
        # rewards = R + yV(s')
        advs = rewards - values
        # for step in range(len(obs)):

        td_map = {
            train_model.X: obs,
            A: actions,
            ADV: advs,
            R: rewards,
            LR: lr
        }
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        policy_loss, value_loss, policy_entropy, _ = sess.run(
            [pg_loss, vf_loss, entropy, _train], td_map)
        return policy_loss, value_loss, policy_entropy

    self.train = train
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
def __init__(self, action_dim, state_dim, lr=0.001, ent_coef=0.01,
             value_coef=0.5, reward_decay=0.95, output_graph=False):
    self.action_dim = action_dim
    self.state_dim = state_dim
    self.lr = lr
    self.ent_coef = ent_coef
    self.value_coef = value_coef
    # self.actor_lr = actor_lr
    # self.critic_lr = critic_lr
    self.gamma = reward_decay
    self.output_graph = output_graph

    tf.reset_default_graph()
    self.sess = tf.Session()
    if output_graph:
        tf.summary.FileWriter("logs/", self.sess.graph)

    with tf.name_scope("inputs"):
        self.tf_obs = tf.placeholder(tf.float32, [None, self.state_dim], name="observations")
        self.tf_ac = tf.placeholder(tf.float32, [None, self.action_dim], name="actions")
        self.advantage = tf.placeholder(tf.float32, [None, ], name="advantage")
        self.R = tf.placeholder(tf.float32, [None, ], name="return")

    # fc1
    layer = tf.layers.dense(
        inputs=self.tf_obs,
        units=10,
        activation=tf.nn.tanh,
        kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
        bias_initializer=tf.constant_initializer(0.1),
        name='actor_fc1')
    # fc2
    mean_all_act = tf.layers.dense(
        inputs=layer,
        units=self.action_dim,
        activation=None,
        kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
        bias_initializer=tf.constant_initializer(0.1),
        name='actor_fc2')
    logstd_act = tf.get_variable(name="logstd",
                                 shape=[1, self.action_dim],
                                 initializer=tf.zeros_initializer())
    pdparam = tf.concat([mean_all_act, mean_all_act * 0.0 + logstd_act], axis=1)
    self.pd = DiagGaussianPd(pdparam)
    self.action = self.pd.sample()
    self.neglogp = self.pd.neglogp(self.action)

    # for the critic network, we share the first layer with the actor
    self.value = tf.layers.dense(
        inputs=layer,
        units=1,
        activation=None,
        kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
        bias_initializer=tf.constant_initializer(0.1),
        name="critic_fc2")

    with tf.name_scope("Loss"):
        # Total loss = Policy loss - entropy * ent_coef + value loss * value_coef
        pg_loss = tf.reduce_mean(self.advantage * self.neglogp)
        value_loss = losses.mean_squared_error(tf.squeeze(self.value), self.R)
        entropy = tf.reduce_mean(self.pd.entropy())
        # total loss
        loss = pg_loss - entropy * self.ent_coef + value_loss * self.value_coef

    with tf.name_scope("Train"):
        train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)

    # Initialize variables only after the whole graph has been built
    self.sess.run(tf.global_variables_initializer())

    def step(observation):
        actions, values, neglogp = self.sess.run(
            [self.action, self.value, self.neglogp],
            feed_dict={self.tf_obs: observation[np.newaxis, :]})
        return actions, values, neglogp

    def learn(obs, actions, rewards, values):
        # calculate adv = reward - V(s)
        # reward = r + yV(s')
        advs = rewards - values
        # value = self.sess.run(self.value, feed_dict={self.obs: state})
        td_map = {
            self.tf_obs: obs,
            self.tf_ac: actions,
            self.advantage: advs,
            self.R: rewards
        }
        policy_loss, vf_loss, policy_entropy, _ = self.sess.run(
            [pg_loss, value_loss, entropy, train_op], feed_dict=td_map)
        return policy_loss, vf_loss, policy_entropy

    self.step = step
    self.learn = learn
def __init__(self, network, env, *, seed=None, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5, ent_coef=0.5, max_grad_norm=0.5, lr=1e-5, lrschedule='constant', gamma=0.99, alpha=0.99, epsilon=1e-5, model_save_path=None, tb_log_path=None): """ Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm. Parameters: ----------- :param network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see policies.py.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See policies.py.py for more details on using recurrent nets in policies.py :param env: RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py) :param seed: seed to make random number sequence in the alorightm reproducible. By default is None which means seed from system noise generator (not reproducible) :param nsteps: int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) :param total_timesteps: int, total number of timesteps to train on (default: 80M) :param vf_coef: float, coefficient in front of value function loss in the total loss function (default: 0.5) :param ent_coef: float, coeffictiant in front of the policy entropy in the total loss function (default: 0.01) :param max_grad_norm: float, gradient is clipped to have global L2 norm no more than this value (default: 0.5) :param lr: float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4) :param lrschedule: schedule of learning rate. 
Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and returns fraction of the learning rate (specified as lr) as output :param epsilon: float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5) :param alpha: float, RMSProp decay parameter (default: 0.99) :param gamma: float, reward discounting parameter (default: 0.99) :param model_save_path: str, the location to save model parameters (if None, auto saving) :param tb_log_path: str, the log location for tensorboard (if None, no logging) """ self.policy = build_policy(network) self.env = env self.nenvs = env.num_envs self.nsteps = nsteps nbatch = self.nenvs * nsteps self.seed = seed self.ent_coef = ent_coef self.vf_coef = vf_coef self.max_grad_norm = max_grad_norm self.lr = lr self.gamma = gamma self.alpha = alpha self.epsilon = epsilon self.total_timesteps = total_timesteps self.lrschedule = lrschedule self.model_save_path = model_save_path self.tb_log_path = None # tb_log_path self.sess = get_session() self.graph = self.sess.graph self.episode_reward = np.zeros((self.nenvs, )) with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling self.step_model = self.policy(self.sess, env.observation_space, env.action_space, self.nenvs, 1, self.nenvs, reuse=False) # train_model is used to train our network self.train_model = self.policy(self.sess, env.observation_space, env.action_space, self.nenvs, self.nsteps, nbatch, reuse=True) with tf.variable_scope('loss', reuse=False): self.action_ph = tf.placeholder(self.train_model.action.dtype, self.train_model.action.shape) self.adv_ph = tf.placeholder(tf.float32, [nbatch]) self.reward_ph = tf.placeholder(tf.float32, [nbatch]) self.lr_ph = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = self.train_model.proba_distribution.neglogp( self.action_ph) # L = A(s,a) * -logpi(a|s) self.pg_loss = tf.reduce_mean(self.adv_ph * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. self.entropy = tf.reduce_mean( self.train_model.proba_distribution.entropy()) # Value loss self.vf_loss = losses.mean_squared_error( tf.squeeze(self.train_model.value_fn), self.reward_ph) self.reg_loss = tf.contrib.layers.apply_regularization( tf.contrib.layers.l2_regularizer(0.8), tf.trainable_variables()) self.loss = self.pg_loss - self.entropy * ent_coef + self.vf_loss * vf_coef + self.reg_loss tf.summary.scalar('lr', self.lr_ph) tf.summary.scalar('pg_loss', self.pg_loss) tf.summary.scalar('entropy', self.entropy) tf.summary.scalar('vf_loss', self.vf_loss) tf.summary.scalar('loss', self.loss) tf.summary.histogram('obs', self.train_model.obs_ph) # Update parameters using loss # 1. Get the model parameters params = tf.trainable_variables("a2c_model") # 2. Calculate the gradients self.grads = grads = tf.gradients(self.loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # 3. 
Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=self.lr_ph, decay=alpha, epsilon=epsilon) self.apply_backprop = trainer.apply_gradients(grads) self.lr_schedule = Scheduler(initial_value=lr, n_values=total_timesteps, schedule=lrschedule) self.step = self.step_model.step self.value = self.step_model.value self.initial_state = self.step_model.initial_state self.def_path_pre = os.path.dirname( os.path.abspath(__file__)) + '/tmp/' # default path prefix self.summary = tf.summary.merge_all() tf.global_variables_initializer().run(session=self.sess)
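The constructor above assembles the A2C objective (policy-gradient loss, minus an entropy bonus, plus a weighted value loss), clips the global gradient norm, and applies RMSProp. Below is a minimal, self-contained sketch of that loss/update graph for a toy categorical policy, assuming TensorFlow 1.x; the two-layer network, coefficients, and placeholder names are illustrative stand-ins, not the project's actual policy objects.

import numpy as np
import tensorflow as tf

nbatch, nactions, obs_dim = 16, 4, 8
obs_ph = tf.placeholder(tf.float32, [nbatch, obs_dim])
action_ph = tf.placeholder(tf.int32, [nbatch])
adv_ph = tf.placeholder(tf.float32, [nbatch])      # advantages A(s,a)
reward_ph = tf.placeholder(tf.float32, [nbatch])   # discounted returns
lr_ph = tf.placeholder(tf.float32, [])

hidden = tf.layers.dense(obs_ph, 32, activation=tf.nn.relu)
logits = tf.layers.dense(hidden, nactions)               # policy head
value = tf.squeeze(tf.layers.dense(hidden, 1), axis=1)   # value head

# -log pi(a|s); the policy-gradient loss weights it by the advantage
neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=action_ph, logits=logits)
pg_loss = tf.reduce_mean(adv_ph * neglogpac)
probs = tf.nn.softmax(logits)
entropy = tf.reduce_mean(-tf.reduce_sum(probs * tf.log(probs + 1e-8), axis=1))
vf_loss = tf.losses.mean_squared_error(reward_ph, value)
loss = pg_loss - 0.01 * entropy + 0.5 * vf_loss          # ent_coef=0.01, vf_coef=0.5

# Clip the global gradient norm, then apply one RMSProp step
params = tf.trainable_variables()
grads = tf.gradients(loss, params)
grads, _ = tf.clip_by_global_norm(grads, 0.5)            # max_grad_norm=0.5
train_op = tf.train.RMSPropOptimizer(
    learning_rate=lr_ph, decay=0.99, epsilon=1e-5).apply_gradients(
        list(zip(grads, params)))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {obs_ph: np.random.randn(nbatch, obs_dim).astype(np.float32),
            action_ph: np.random.randint(nactions, size=nbatch),
            adv_ph: np.random.randn(nbatch).astype(np.float32),
            reward_ph: np.random.randn(nbatch).astype(np.float32),
            lr_ph: 7e-4}
    print(sess.run([loss, train_op], feed)[0])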
def __init__(self, policy, env, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs*nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr} if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks policy_loss, value_loss, policy_entropy, _ = sess.run( [pg_loss, vf_loss, entropy, _train], td_map ) return policy_loss, value_loss, policy_entropy self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
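In the train closure above, the advantage is computed as advs = rewards - values on the assumption that the runner (not shown here) has already turned raw rewards into discounted n-step returns bootstrapped with V(s'). A minimal NumPy sketch of that bookkeeping under the same assumption; the function name and the example numbers are illustrative, not part of the original code.

import numpy as np

def discounted_returns(rewards, dones, last_value, gamma=0.99):
    """Backward pass: R_t = r_t + gamma * R_{t+1}, bootstrapped with V(s') at the end."""
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = last_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns

rewards = np.array([1.0, 0.0, 0.0, 1.0])
dones   = np.array([0.0, 0.0, 0.0, 0.0])
values  = np.array([0.9, 0.8, 0.7, 0.6])          # V(s) from the value head
returns = discounted_returns(rewards, dones, last_value=0.5)
advs = returns - values                           # A(s,a) = R + gamma*V(s') - V(s)
print(returns, advs)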
def __init__(self, policy, env, nsteps, dropoutpi_keep_prob, dropoutpi_keep_prob_value, dropoutvf_keep_prob, dropoutvf_keep_prob_value, isbnpitrainmode, isbnvftrainmode, l1regpi, l2regpi, l1regvf, l2regvf, wclippi, wclipvf, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', regnologstd=False, regonlylogstd=False): sess = tf_util.get_session() nenvs = env.num_envs nbatch = nenvs * nsteps with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE): # step_model is used for sampling step_model = policy(nenvs, 1, sess) # train_model is used to train our network train_model = policy(nbatch, nsteps, sess) A = tf.placeholder(train_model.action.dtype, train_model.action.shape) ADV = tf.placeholder(tf.float32, [nbatch]) R = tf.placeholder(tf.float32, [nbatch]) LR = tf.placeholder(tf.float32, []) self.dropoutpi_keep_prob = dropoutpi_keep_prob self.dropoutpi_keep_prob_value = dropoutpi_keep_prob_value self.dropoutvf_keep_prob = dropoutvf_keep_prob self.dropoutvf_keep_prob_value = dropoutvf_keep_prob_value self.isbnpitrainmode = isbnpitrainmode self.isbnvftrainmode = isbnvftrainmode #REGULARIZATION self.toregularizepi = l1regpi > 0 or l2regpi > 0 self.toregularizevf = l1regvf > 0 or l2regvf > 0 self.toweightclippi = wclippi > 0 self.toweightclipvf = wclipvf > 0 # Calculate the loss # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Policy loss neglogpac = train_model.pd.neglogp(A) # L = A(s,a) * -logpi(a|s) pg_loss = tf.reduce_mean(ADV * neglogpac) # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # Value loss vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R) loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef if self.toregularizepi: print("Regularizing policy network: L1 = {}, L2 = {}".format( l1regpi, l2regpi)) regularizerpi = tf.contrib.layers.l1_l2_regularizer( scale_l1=l1regpi, scale_l2=l2regpi, scope='a2c_model/pi') all_trainable_weights_pi = find_trainable_variables('a2c_model/pi') regularization_penalty_pi = tf.contrib.layers.apply_regularization( regularizerpi, all_trainable_weights_pi) loss = loss + regularization_penalty_pi if self.toregularizevf: print("Regularizing value network: L1 = {}, L2 = {}".format( l1regvf, l2regvf)) regularizervf = tf.contrib.layers.l1_l2_regularizer( scale_l1=l1regvf, scale_l2=l2regvf, scope='a2c_model/vf') all_trainable_weights_vf = find_trainable_variables('a2c_model/vf') regularization_penalty_vf = tf.contrib.layers.apply_regularization( regularizervf, all_trainable_weights_vf) loss = loss + regularization_penalty_vf # Update parameters using loss # 1. Get the model parameters params = find_trainable_variables("a2c_model") # 2. Calculate the gradients grads = tf.gradients(loss, params) if max_grad_norm is not None: # Clip the gradients (normalize) grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads = list(zip(grads, params)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da # 3. 
Make op for one policy and value update step of A2C trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon) _train = trainer.apply_gradients(grads) if self.toweightclippi: print("Weight clipping policy network = {}".format(wclippi)) policyparams = find_trainable_variables('a2c_model/pi') self._wclip_ops_pi = [] self.wclip_bounds_pi = [-wclippi, wclippi] for toclipvar in policyparams: if 'logstd' in toclipvar.name: continue self._wclip_ops_pi.append( tf.assign( toclipvar, tf.clip_by_value(toclipvar, self.wclip_bounds_pi[0], self.wclip_bounds_pi[1]))) self._wclip_op_pi = tf.group(*self._wclip_ops_pi) if self.toweightclipvf: print("Weight clipping value network = {}".format(wclipvf)) valueparams = find_trainable_variables('a2c_model/vf') self._wclip_ops_vf = [] self.wclip_bounds_vf = [-wclipvf, wclipvf] for toclipvar in valueparams: self._wclip_ops_vf.append( tf.assign( toclipvar, tf.clip_by_value(toclipvar, self.wclip_bounds_vf[0], self.wclip_bounds_vf[1]))) self._wclip_op_vf = tf.group(*self._wclip_ops_vf) lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule) def train(obs, states, rewards, masks, actions, values): # Here we calculate advantage A(s,a) = R + yV(s') - V(s) # rewards = R + yV(s') advs = rewards - values for step in range(len(obs)): cur_lr = lr.value() td_map = { train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr } if states is not None: td_map[train_model.S] = states td_map[train_model.M] = masks if self.dropoutpi_keep_prob is not None: td_map[ self.dropoutpi_keep_prob] = self.dropoutpi_keep_prob_value if self.dropoutvf_keep_prob is not None: td_map[ self.dropoutvf_keep_prob] = self.dropoutvf_keep_prob_value if self.isbnpitrainmode is not None: td_map[self.isbnpitrainmode] = True if self.isbnvftrainmode is not None: td_map[self.isbnvftrainmode] = True train_tensors = [pg_loss, vf_loss, entropy, _train] if self.toweightclippi: train_tensors.append(self._wclip_op_pi) if self.toweightclipvf: train_tensors.append(self._wclip_op_vf) policy_loss, value_loss, policy_entropy, _ = sess.run( train_tensors, td_map)[:4] return policy_loss, value_loss, policy_entropy self.train = train self.train_model = train_model self.step_model = step_model self.step = step_model.step self.value = step_model.value self.initial_state = step_model.initial_state self.save = functools.partial(tf_util.save_variables, sess=sess) self.load = functools.partial(tf_util.load_variables, sess=sess) tf.global_variables_initializer().run(session=sess)
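The weight-clipping branch above projects each trainable weight of the policy (and value) network back into [-wclip, wclip] after every update by grouping per-variable assign ops. A minimal sketch of that mechanism, assuming TensorFlow 1.x; the variable names and the clip bound here are illustrative.

import tensorflow as tf

wclip = 0.1
w = tf.Variable(tf.random_normal([3, 3]), name='pi/w')
b = tf.Variable(tf.zeros([3]), name='pi/b')

# One assign-with-clip op per trainable variable, skipping logstd as above
clip_ops = [tf.assign(v, tf.clip_by_value(v, -wclip, wclip))
            for v in tf.trainable_variables()
            if 'logstd' not in v.name]
wclip_op = tf.group(*clip_ops)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(wclip_op)                              # run after each gradient step
    print(sess.run(tf.reduce_max(tf.abs(w))))       # now <= wclip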