def _build_train_ops(self):
    """Build the actor-critic training graph.

    Creates separate Adam optimizers (with their own learning-rate
    placeholders) for the critic and the actor, optionally clips each
    gradient by norm, wires up TensorBoard summaries, and initializes all
    variables in the session.
    """
    self.lr_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_c')
    self.lr_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_a')

    with tf.variable_scope('critic_train'):
        # self.reg_c = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.critic_vars])
        # Critic minimizes the mean squared TD error.
        self.loss_c = tf.reduce_mean(tf.square(
            self.td_error))  # + 0.001 * self.reg_c
        self.optim_c = tf.train.AdamOptimizer(self.lr_c)
        self.grads_c = self.optim_c.compute_gradients(
            self.loss_c, self.critic_vars)
        if self.clip_norm:
            # NOTE(review): assumes every critic var receives a gradient;
            # tf.clip_by_norm raises on a None grad — confirm.
            self.grads_c = [(tf.clip_by_norm(grad, self.clip_norm), var)
                            for grad, var in self.grads_c]
        self.train_op_c = self.optim_c.apply_gradients(self.grads_c)

    with tf.variable_scope('actor_train'):
        # self.reg_a = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.actor_vars])
        # self.entropy_a =- tf.reduce_sum(self.actor * tf.log(self.actor))
        # Policy-gradient loss: cross-entropy of the taken actions weighted by
        # the TD error; stop_gradient keeps the actor update from backprop-ing
        # through the critic's TD-error computation.
        self.loss_a = tf.reduce_mean(
            tf.stop_gradient(self.td_error) *
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.actor, labels=self.a),
            name='loss_actor')  # + 0.001 * self.reg_a
        self.optim_a = tf.train.AdamOptimizer(self.lr_a)
        self.grads_a = self.optim_a.compute_gradients(
            self.loss_a, self.actor_vars)
        if self.clip_norm:
            self.grads_a = [(tf.clip_by_norm(grad, self.clip_norm), var)
                            for grad, var in self.grads_a]
        self.train_op_a = self.optim_a.apply_gradients(self.grads_a)

    with tf.variable_scope('summary'):
        self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
        self.summary = [
            tf.summary.scalar('loss/critic', self.loss_c),
            tf.summary.scalar('loss/actor', self.loss_a),
            tf.summary.scalar('episode_reward', self.ep_reward)
        ]
        # Per-variable gradient-norm summaries; vars with no gradient are skipped.
        self.summary += [
            tf.summary.scalar('grads/a_' + var.name, tf.norm(grad))
            for grad, var in self.grads_a if grad is not None
        ]
        self.summary += [
            tf.summary.scalar('grads/c_' + var.name, tf.norm(grad))
            for grad, var in self.grads_c if grad is not None
        ]
        self.merged_summary = tf.summary.merge_all(
            key=tf.GraphKeys.SUMMARIES)

    self.train_ops = [self.train_op_a, self.train_op_c]
    self.sess.run(tf.global_variables_initializer())
def _build_train_ops(self):
    """Build the PPO training graph.

    Actor uses the clipped-surrogate objective (ratio of new/old policy
    log-probs clipped into [1 - clip_range, 1 + clip_range]); critic uses
    the mean squared error against the value target. Both use Adam with
    optional per-gradient norm clipping. Finishes by initializing all
    variables in the session.
    """
    self.lr_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_actor')
    self.lr_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_critic')
    self.clip_range = tf.placeholder(tf.float32, shape=None, name='ratio_clip_range')

    with tf.variable_scope('actor_train'):
        # Importance ratio pi_new(a|s) / pi_old(a|s), computed in log space.
        ratio = tf.exp(self.logp_a - self.old_logp_a)
        ratio_clipped = tf.clip_by_value(ratio, 1.0 - self.clip_range,
                                         1.0 + self.clip_range)
        # Pessimistic (min) surrogate; negated because we minimize.
        loss_a = -tf.reduce_mean(
            tf.minimum(self.adv * ratio, self.adv * ratio_clipped))
        optim_a = tf.train.AdamOptimizer(self.lr_a)
        grads_a = optim_a.compute_gradients(loss_a, var_list=self.actor_vars)
        if self.clip_norm:
            grads_a = [(tf.clip_by_norm(g, self.clip_norm), v)
                       for g, v in grads_a]
        self.train_op_a = optim_a.apply_gradients(grads_a)

    with tf.variable_scope('critic_train'):
        loss_c = tf.reduce_mean(tf.square(self.v_target - self.critic))
        optim_c = tf.train.AdamOptimizer(self.lr_c)
        grads_c = optim_c.compute_gradients(loss_c, var_list=self.critic_vars)
        if self.clip_norm:
            grads_c = [(tf.clip_by_norm(g, self.clip_norm), v)
                       for g, v in grads_c]
        self.train_op_c = optim_c.apply_gradients(grads_c)

    self.train_ops = [self.train_op_a, self.train_op_c]

    with tf.variable_scope('summary'):
        self.ep_reward = tf.placeholder(tf.float32, name='episode_reward')
        self.summary = [
            tf.summary.scalar('loss/adv', tf.reduce_mean(self.adv)),
            tf.summary.scalar('loss/ratio', tf.reduce_mean(ratio)),
            tf.summary.scalar('loss/loss_actor', loss_a),
            tf.summary.scalar('loss/loss_critic', loss_c),
            tf.summary.scalar('episode_reward', self.ep_reward)
        ]
        # self.summary += [tf.summary.scalar('grads/' + v.name, tf.norm(g))
        #                  for g, v in grads_a if g is not None]
        # self.summary += [tf.summary.scalar('grads/' + v.name, tf.norm(g))
        #                  for g, v in grads_c if g is not None]
        self.merged_summary = tf.summary.merge_all(
            key=tf.GraphKeys.SUMMARIES)

    self.sess.run(tf.global_variables_initializer())
def two_linear( self, xin, linear_size, residual, dropout_keep_prob, max_norm, batch_norm, dtype, idx ):
    """
    Make a bi-linear block with optional residual connection

    Args
      xin: the batch that enters the block
      linear_size: integer. The size of the linear units
      residual: boolean. Whether to add a residual connection
      dropout_keep_prob: float [0,1]. Probability of dropping something out
      max_norm: boolean. Whether to clip weights to 1-norm
      batch_norm: boolean. Whether to do batch normalization
      dtype: type of the weigths. Usually tf.float32
      idx: integer. Number of layer (for naming/scoping)
    Returns
      y: the batch after it leaves the block
    """

    with vs.variable_scope( "two_linear_"+str(idx) ) as scope:

        input_size = int(xin.get_shape()[1])

        # Linear 1: input_size -> linear_size, then (optional BN) -> ReLU -> dropout.
        w2 = tf.get_variable( name="w2_"+str(idx), initializer=kaiming, shape=[input_size, linear_size], dtype=dtype)
        b2 = tf.get_variable( name="b2_"+str(idx), initializer=kaiming, shape=[linear_size], dtype=dtype)
        # max_norm clips the whole weight matrix to norm 1 (not per-column).
        w2 = tf.clip_by_norm(w2,1) if max_norm else w2
        y = tf.matmul(xin, w2) + b2
        if batch_norm:
            y = tf.layers.batch_normalization(y,training=self.isTraining,name="batch_normalization1"+str(idx))
        y = tf.nn.relu( y )
        y = tf.nn.dropout( y, dropout_keep_prob )

        # Linear 2: linear_size -> linear_size, same post-processing.
        w3 = tf.get_variable( name="w3_"+str(idx), initializer=kaiming, shape=[linear_size, linear_size], dtype=dtype)
        b3 = tf.get_variable( name="b3_"+str(idx), initializer=kaiming, shape=[linear_size], dtype=dtype)
        w3 = tf.clip_by_norm(w3,1) if max_norm else w3
        y = tf.matmul(y, w3) + b3
        if batch_norm:
            y = tf.layers.batch_normalization(y,training=self.isTraining,name="batch_normalization2"+str(idx))
        y = tf.nn.relu( y )
        y = tf.nn.dropout( y, dropout_keep_prob )

        # Residual every 2 blocks
        y = (xin + y) if residual else y

    return y
def optimize_normal(self, loss, params):
    '''Build the Adam training op for `loss`.

    loss: the loss tensor to minimize.
    params: the variables to optimize.

    Returns the apply-gradients op; running it also increments
    `self.global_step` (via a control dependency).
    '''
    self.global_step = tf.Variable(0, name='global_step')
    # Learning rate: either a schedule built by update_lr(), or a fixed
    # non-trainable variable initialized from init_lr.
    if self.is_update_lr:
        self.lr = self.update_lr()
    else:
        self.lr = tf.Variable(self.init_lr, trainable=False)
    self.optimizer = tf.train.AdamOptimizer(self.lr)
    grads_and_vars = self.optimizer.compute_gradients(loss, params)
    # Per-variable gradient clipping; skipped when no limit is configured.
    # (Was `!= None`; identity comparison is the correct Python idiom.)
    if self.max_grad_norm is not None:
        clipped_grads_and_vars = [
            (tf.clip_by_norm(gv[0], self.max_grad_norm), gv[1])
            for gv in grads_and_vars
        ]
    else:
        clipped_grads_and_vars = grads_and_vars
    # Tie the step counter to every optimizer run.
    inc = self.global_step.assign_add(1)
    with tf.control_dependencies([inc]):
        optimize = self.optimizer.apply_gradients(clipped_grads_and_vars)
    return optimize
def _add_train_graph(self):
    """Define the training operation.

    Builds an exponentially-decayed learning rate, momentum optimizer,
    per-variable gradient-norm clipping, and histogram/scalar summaries,
    then exposes the train step as `self.train_op`.
    """
    mc = self.mc

    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    lr = tf.train.exponential_decay(mc.LEARNING_RATE,
                                    self.global_step,
                                    mc.DECAY_STEPS,
                                    mc.LR_DECAY_FACTOR,
                                    staircase=True)
    tf.summary.scalar('learning_rate', lr)

    _add_loss_summaries(self.loss)

    opt = tf.train.MomentumOptimizer(learning_rate=lr, momentum=mc.MOMENTUM)
    grads_vars = opt.compute_gradients(self.loss, tf.trainable_variables())

    with tf.variable_scope('clip_gradient') as scope:
        for i, (grad, var) in enumerate(grads_vars):
            # BUG FIX: variables that do not contribute to the loss get a
            # None gradient; tf.clip_by_norm(None, ...) raises. Leave those
            # entries untouched (the summary loop below already skips them).
            if grad is not None:
                grads_vars[i] = (tf.clip_by_norm(grad, mc.MAX_GRAD_NORM), var)

    apply_gradient_op = opt.apply_gradients(grads_vars,
                                            global_step=self.global_step)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)

    for grad, var in grads_vars:
        if grad is not None:
            tf.summary.histogram(var.op.name + '/gradients', grad)

    # Make sure gradients are actually applied before the (no-op) train op.
    with tf.control_dependencies([apply_gradient_op]):
        self.train_op = tf.no_op(name='train')
def linear(input_features, output_size, weight_max_norm, weight_initializer, bias_initializer, name):
    """Builds a linear (fully-connected) layer.

    Args:
      input_features: A tensor of input features. Shape = [..., feature_dim].
      output_size: An integer for the number of output nodes.
      weight_max_norm: A float giving the max norm to clip the kernel at;
        any non-positive value disables clipping.
      weight_initializer: A function handle for kernel weight initializer.
      bias_initializer: A function handle for bias initializer.
      name: A string for the variable scope name.

    Returns:
      A tensor of output logits. Shape = [..., output_size].
    """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        feature_dim = input_features.shape.as_list()[-1]
        kernel = tf.get_variable(
            name='weight',
            shape=[feature_dim, output_size],
            initializer=weight_initializer)
        # Optionally bound the kernel's norm.
        if weight_max_norm > 0.0:
            kernel = tf.clip_by_norm(kernel, clip_norm=weight_max_norm)
        offset = tf.get_variable(
            name='bias', shape=[output_size], initializer=bias_initializer)
        return tf.linalg.matmul(input_features, kernel) + offset
def clip_gradients_by_norm(grads_and_vars, add_to_summary=False):
    """Clip each gradient to norm 10 and check it for NaN/Inf.

    Args:
        grads_and_vars: list of (gradient, variable) pairs as returned by
            `Optimizer.compute_gradients`; gradients may be None.
        add_to_summary: if True, emit tensor summaries for the raw and
            clipped gradients (and their absolute values).

    Returns:
        The list of (clipped gradient, variable) pairs; None gradients are
        passed through unchanged.
    """
    if add_to_summary:
        for grad, var in grads_and_vars:
            if grad is not None:
                variable_summaries(grad, 'grad/{}'.format(var.name[:-2]),
                                   'full')
                variable_summaries(tf.abs(grad),
                                   'grad/abs/{}'.format(var.name[:-2]),
                                   'full')

    # Clip by norm. Grad can be null when not training some modules.
    with tf.name_scope('clip_gradients_by_norm'):
        grads_and_vars = [(tf.check_numerics(
            tf.clip_by_norm(gv[0], 10.), 'Invalid gradient'), gv[1])
                          if gv[0] is not None else gv
                          for gv in grads_and_vars]

    if add_to_summary:
        for grad, var in grads_and_vars:
            if grad is not None:
                variable_summaries(grad,
                                   'clipped_grad/{}'.format(var.name[:-2]),
                                   'full')
                # BUG FIX: the abs summary previously reused the same
                # 'clipped_grad/{}' tag as the raw clipped gradient; mirror
                # the pre-clip 'grad/abs/{}' naming scheme instead.
                variable_summaries(tf.abs(grad),
                                   'clipped_grad/abs/{}'.format(var.name[:-2]),
                                   'full')

    return grads_and_vars
def make_train_step(self):
    """Create the Adam training op, optionally freezing the graph model."""
    trainable_vars = self.sess.graph.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES)
    if self.args.get('--freeze-graph-model'):
        # Exclude every variable that belongs to the graph model scope.
        frozen = set(
            self.sess.graph.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="graph_model"))
        kept = []
        for var in trainable_vars:
            if var in frozen:
                print("Freezing weights of variable %s." % var.name)
            else:
                kept.append(var)
        trainable_vars = kept

    # Optimize the loss with per-gradient norm clipping.
    optimizer = tf.train.AdamOptimizer(self.params['learning_rate'])
    pairs = optimizer.compute_gradients(self.ops['loss'],
                                        var_list=trainable_vars)
    clipped = []
    for grad, var in pairs:
        if grad is None:
            clipped.append((grad, var))
        else:
            clipped.append(
                (tf.clip_by_norm(grad, self.params['clamp_gradient_norm']),
                 var))
    self.ops['train_step'] = optimizer.apply_gradients(clipped)

    # Initialize newly-introduced variables:
    self.sess.run(tf.global_variables_initializer())
def flatgrad(loss, var_list, clip_norm=None):
    """Return the gradient of `loss` w.r.t. `var_list` flattened to one 1-D tensor.

    Variables with no gradient contribute zeros of their own size. If
    `clip_norm` is given, each existing gradient is clipped to that norm.
    """
    grads = tf.gradients(loss, var_list)
    if clip_norm is not None:
        # BUG FIX: tf.gradients returns None for variables the loss does not
        # depend on; tf.clip_by_norm(None, ...) raises, so skip those (the
        # concat below already substitutes zeros for them).
        grads = [
            tf.clip_by_norm(grad, clip_norm=clip_norm)
            if grad is not None else grad for grad in grads
        ]
    return tf.concat(axis=0, values=[
        tf.reshape(grad if grad is not None else tf.zeros_like(v), [numel(v)])
        for (v, grad) in zip(var_list, grads)
    ])
def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    """Minimize `objective` over `var_list`, clipping each gradient's norm
    to `clip_val` first; `clip_val=None` falls back to plain minimize."""
    if clip_val is None:
        return optimizer.minimize(objective, var_list=var_list)
    pairs = optimizer.compute_gradients(objective, var_list=var_list)
    for idx, (grad, var) in enumerate(pairs):
        # Variables without a gradient are left untouched.
        if grad is not None:
            pairs[idx] = (tf.clip_by_norm(grad, clip_val), var)
    return optimizer.apply_gradients(pairs)
def flatgrad(loss, var_list, clip_norm=None):
    """Return the gradient of `loss` w.r.t. `var_list` flattened to one 1-D tensor.

    Variables with no gradient contribute zeros of their own size. If
    `clip_norm` is given, each existing gradient is clipped to that norm.
    """
    grads = tf.gradients(loss, var_list)
    if clip_norm is not None:
        # BUG FIX: skip None gradients (variables the loss does not depend
        # on); tf.clip_by_norm(None, ...) raises. The concat below already
        # substitutes zeros for them.
        grads = [
            tf.clip_by_norm(g, clip_norm=clip_norm) if g is not None else g
            for g in grads
        ]
    return tf.concat([
        tf.reshape(g if g is not None else tf.zeros_like(v), [-1])
        for v, g in zip(var_list, grads)
    ], axis=0)
def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    """Minimized `objective` using `optimizer` w.r.t. variables in
    `var_list` while ensure the norm of the gradients for each
    variable is clipped to `clip_val`
    """
    pairs = optimizer.compute_gradients(objective, var_list=var_list)
    clipped = []
    for grad, var in pairs:
        # None gradients (unused variables) pass through unchanged.
        if grad is None:
            clipped.append((grad, var))
        else:
            clipped.append((tf.clip_by_norm(grad, clip_val), var))
    return optimizer.apply_gradients(clipped)
def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate, tau, batch_size):
    """DDPG-style actor: online + target networks, soft target updates, and
    an Adam update driven by the critic's action gradients.

    Args:
        sess: TensorFlow session.
        state_dim: size of the state space.
        action_dim: size of the action space.
        action_bound: scaling bound applied to the network output.
        learning_rate: Adam learning rate for the actor.
        tau: soft-update mixing factor for the target network.
        batch_size: minibatch size used to average the policy gradient.
    """
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.action_bound = action_bound
    self.learning_rate = learning_rate
    self.tau = tau
    self.batch_size = batch_size

    # Actor Network
    self.inputs, self.out, self.scaled_out = self.create_actor_network()

    self.network_params = tf.trainable_variables()

    # Target Network
    self.target_inputs, self.target_out, self.target_scaled_out = self.create_actor_network(
    )

    # Relies on creation order: the target net's trainables are exactly
    # those created after the online net's.
    self.target_network_params = tf.trainable_variables(
    )[len(self.network_params):]

    # Op for periodically updating target network with online network
    # weights: target <- tau * online + (1 - tau) * target
    self.update_target_network_params = \
        [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                              tf.multiply(self.target_network_params[i], 1. - self.tau))
         for i in range(len(self.target_network_params))]

    # This gradient will be provided by the critic network
    self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])

    # Combine the gradients here. The negative sign turns gradient ascent
    # on Q into gradient descent for the optimizer.
    self.unnormalized_actor_gradients = tf.gradients(
        self.scaled_out, self.network_params, -self.action_gradient)
    # Clip each parameter gradient to norm 5 before averaging.
    self.clipped_gradients = [
        tf.clip_by_norm(grad, 5.0)
        for grad in self.unnormalized_actor_gradients
    ]
    # Average over the minibatch.
    self.actor_gradients = list(
        map(lambda x: tf.div(x, self.batch_size), self.clipped_gradients))

    # Optimization Op (run any pending UPDATE_OPS, e.g. batch norm, first).
    self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(self.update_ops):
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(
                zip(self.actor_gradients, self.network_params))

    self.num_trainable_vars = len(self.network_params) + len(
        self.target_network_params)
def langevin_step(counter, x_mod):
    """One step of Langevin-dynamics sampling on `x_mod`.

    Adds Gaussian noise, descends the energy gradient (optionally
    projected/clipped, or replaced by HMC), clips the sample into
    [0, FLAGS.rescale], and increments the loop counter.

    NOTE(review): relies on `model`, `weights`, `LABEL_SPLIT`, `j`, `hmc`
    and `FLAGS` captured from the enclosing scope — confirm at call site.
    """
    # Perturb the current sample with small Gaussian noise.
    x_mod = x_mod + tf.random_normal(
        tf.shape(x_mod),
        mean=0.0,
        stddev=0.005 * FLAGS.rescale * FLAGS.noise_scale)

    energy_noise = energy_start = tf.concat(
        [
            model.forward(
                x_mod,
                weights[0],
                label=LABEL_SPLIT[j],
                reuse=True,
                stop_at_grad=False,
                stop_batch=True)
        ],
        axis=0)

    x_grad, label_grad = tf.gradients(FLAGS.temperature * energy_noise,
                                      [x_mod, LABEL_SPLIT[j]])
    energy_noise_old = energy_noise

    lr = FLAGS.step_lr

    # Optionally project/clip the gradient before the descent step.
    if FLAGS.proj_norm != 0.0:
        if FLAGS.proj_norm_type == 'l2':
            x_grad = tf.clip_by_norm(x_grad, FLAGS.proj_norm)
        elif FLAGS.proj_norm_type == 'li':
            # 'li' = element-wise (L-infinity style) clipping.
            x_grad = tf.clip_by_value(x_grad, -FLAGS.proj_norm,
                                      FLAGS.proj_norm)
        else:
            print("Other types of projection are not supported!!!")
            assert False

    # Clip gradient norm for now
    if FLAGS.hmc:
        # Step size should be tuned to get around 65% acceptance
        def energy(x):
            return FLAGS.temperature * \
                model.forward(x, weights[0], label=LABEL_SPLIT[j], reuse=True)

        x_last = hmc(x_mod, 15., 10, energy)
    else:
        # Plain gradient-descent Langevin update.
        x_last = x_mod - (lr) * x_grad

    x_mod = x_last
    # Keep the sample inside the valid pixel range.
    x_mod = tf.clip_by_value(x_mod, 0, FLAGS.rescale)

    counter = counter + 1

    return counter, x_mod
def add_optimizer_op(self, scope):
    """Set self.train_op and self.grad_norm.

    Builds an Adam optimizer over the variables in `scope`, optionally
    clips each gradient by norm (self.config.clip_val), applies them as
    self.train_op, and records the global norm of the (non-None) gradients
    in self.grad_norm.

    Args:
        scope: (string) scope name, that specifies if target network or not
    """
    opt = tf.train.AdamOptimizer(learning_rate=self.lr)
    scope_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope=scope)
    grads = opt.compute_gradients(self.loss, scope_vars)
    if self.config.grad_clip:
        # BUG FIX: variables not reached by the loss get a None gradient;
        # tf.clip_by_norm(None, ...) raises, so pass those pairs through.
        grads = [(tf.clip_by_norm(grad, self.config.clip_val), var)
                 if grad is not None else (grad, var)
                 for grad, var in grads]
    self.train_op = opt.apply_gradients(grads)
    # Per the spec: global norm over the gradients which are not None
    # (tf.global_norm rejects None entries).
    self.grad_norm = tf.global_norm(
        [grad for grad, _ in grads if grad is not None])
def _compute_gradients(self, loss, var_list=None):
    """Compute gradients of `loss`, then optionally NaN-protect, scale by a
    learning-rate multiplier, and clip them per the configured method.

    Clip methods: 'norm', 'value', 'avg_norm' clip each gradient
    individually; 'global_norm' clips the whole gradient list jointly.
    Returns the final list of (gradient, variable) pairs.
    """
    # Sanity check
    assert isinstance(loss, tf.Tensor)

    # Compute gradients using default method
    grads_and_vars = self._tf_optimizer.compute_gradients(
        loss, var_list=var_list)

    # Deal with NaN if necessary
    if hub.clip_nan_protection:
        grads_and_vars = [(self._deal_with_nan(grad), var)
                          for grad, var in grads_and_vars]

    # Apply lr decay if necessary
    lr_decay = hub.clip_lr_multiplier
    if lr_decay < 1.0:
        assert lr_decay > 0
        grads_and_vars = [(grad * lr_decay, var)
                          for grad, var in grads_and_vars]

    # Clip gradient if necessary (threshold <= 0 disables clipping)
    if self._threshold > 0:
        bound = self._threshold
        if self._method in ('norm', 'value', 'avg_norm'):
            # Element/tensor-wise clipping, one gradient at a time.
            if self._method == 'norm':
                method = lambda g: tf.clip_by_norm(g, bound)
            elif self._method == 'value':
                method = lambda g: tf.clip_by_value(g, -bound, bound)
            else:
                method = lambda g: tf.clip_by_average_norm(g, bound)
            grads_and_vars = [(method(grad), var)
                              for grad, var in grads_and_vars]
        else:
            # Joint clipping over the concatenated gradient vector.
            assert self._method == 'global_norm'
            grads = [g for g, _ in grads_and_vars]
            clipped_grads, _ = tf.clip_by_global_norm(
                grads, self._threshold)
            vars_ = [v for _, v in grads_and_vars]
            grads_and_vars = list(zip(clipped_grads, vars_))

    return grads_and_vars
def __init__(self, state_size, action_size, alpha=0.01, clip_norm=None, minibatch_size=5, **kwargs):
    """
    Parameters
    ----------
    state_size, action_size : int
        Size of the environment state space and action space
    alpha : float, optional
        Network learning rate
    clip_norm : float, optional
        Max gradient magnitude for clipping, default no clipping
    minibatch_size : int, optional
        Size of minibatches for updating
    **kwargs
        Additional keyword arguments passed to `SingleLayerNetwork`
    """
    super().__init__(state_size, action_size, **kwargs)
    self.k = int(minibatch_size)

    # Target Q values fed in at update time.
    self.nextQ = tf.placeholder(shape=[None, action_size], dtype=tf.float32)
    squared_error = tf.reduce_sum(tf.square(self.nextQ - self.Q_est))

    optimizer = tf.train.RMSPropOptimizer(alpha)
    params = [self.W, self.w_in, self.b_in]
    if clip_norm is None:
        # No clipping requested: let the optimizer handle everything.
        self.updateModel = optimizer.minimize(squared_error, var_list=params)
    else:
        # Clip each parameter's gradient by norm before applying.
        pairs = optimizer.compute_gradients(squared_error, params)
        capped = [(tf.clip_by_norm(g, clip_norm), v) for g, v in pairs]
        self.updateModel = optimizer.apply_gradients(capped)

    self.var_init()
def add_optimizer_op(self, scope):
    """Set self.train_op and self.grad_norm.

    Builds an Adam optimizer over the variables in `scope`, optionally
    clips each gradient by norm (self.config.clip_val) before applying,
    and records the global norm of the raw (non-None) gradients in
    self.grad_norm.

    Args:
        scope: (string) name of the scope whose variables we are
            differentiating with respect to
    """
    optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
    scope_variable = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope=scope)
    grads_and_vars = optimizer.compute_gradients(self.loss, scope_variable)
    if self.config.grad_clip:
        train_pairs = [(tf.clip_by_norm(item[0], self.config.clip_val),
                        item[1]) for item in grads_and_vars]
    else:
        # BUG FIX: previously apply_gradients referenced the clipped list
        # even when grad_clip was disabled, raising NameError.
        train_pairs = grads_and_vars
    self.train_op = optimizer.apply_gradients(train_pairs)
    # Per the spec: global norm over the gradients which are not None
    # (tf.global_norm rejects None entries).
    self.grad_norm = tf.global_norm(
        [item[0] for item in grads_and_vars if item[0] is not None])
def clip_by_norm(v, clip_norm):
    """Clip `v` by norm computed over every axis except the last."""
    rank = len(v.get_shape())
    reduction_axes = list(range(rank - 1))
    return tf.clip_by_norm(v, clip_norm, axes=reduction_axes)
def dpg(q_max, a_max, dqda_clipping=None, clip_norm=False, name="DpgLearning"): """Implements the Deterministic Policy Gradient (DPG) loss as a TensorFlow Op. This op implements the loss for the `actor`, the `critic` can instead be updated by minimizing the `value_ops.td_learning` loss. See "Deterministic Policy Gradient Algorithms" by Silver, Lever, Heess, Degris, Wierstra, Riedmiller (http://proceedings.mlr.press/v32/silver14.pdf). Args: q_max: Tensor holding Q-values generated by Q network with the input of (state, a_max) pair, shape `[B]`. a_max: Tensor holding the optimal action, shape `[B, action_dimension]`. dqda_clipping: `int` or `float`, clips the gradient dqda element-wise between `[-dqda_clipping, dqda_clipping]`. clip_norm: Whether to perform dqda clipping on the vector norm of the last dimension, or component wise (default). name: name to prefix ops created within this op. Returns: A namedtuple with fields: * `loss`: a tensor containing the batch of losses, shape `[B]`. * `extra`: a namedtuple with fields: * `q_max`: Tensor holding the optimal Q values, `[B]`. * `a_max`: Tensor holding the optimal action, `[B, action_dimension]`. * `dqda`: Tensor holding the derivative dq/da, `[B, action_dimension]`. Raises: ValueError: If `q_max` doesn't depend on `a_max` or if `dqda_clipping <= 0`. """ # DPG op. with tf.name_scope(name, values=[q_max, a_max]): # Calculate the gradient dq/da. dqda = tf.gradients([q_max], [a_max])[0] # Check that `q_max` depends on `a_max`. if dqda is None: raise ValueError("q_max needs to be a function of a_max") # Clipping the gradient dq/da. if dqda_clipping is not None: if dqda_clipping <= 0: raise ValueError( "dqda_clipping should be bigger than 0, {} found".format( dqda_clipping)) if clip_norm: dqda = tf.clip_by_norm(dqda, dqda_clipping, axes=-1) else: dqda = tf.clip_by_value(dqda, -1. * dqda_clipping, dqda_clipping) # Target_a ensures correct gradient calculated during backprop. 
target_a = dqda + a_max # Stop the gradient going through Q network when backprop. target_a = tf.stop_gradient(target_a) # Gradient only go through actor network. loss = 0.5 * tf.reduce_sum(tf.square(target_a - a_max), axis=-1) return base_ops.LossOutput( loss, DPGExtra(q_max=q_max, a_max=a_max, dqda=dqda))
def build_model(self):
    """Build the GRU-based session recommender graph.

    Creates the input/state placeholders, an item GRU stack, a behavior
    embedding branch, a dense fusion layer, a (sampled) softmax output,
    and — when training — the Adam training op with optional gradient
    clipping and exponential learning-rate decay.
    """
    self.X = tf.placeholder(tf.int32, [self.batch_size], name='input')
    # With negative sampling the output placeholder also covers the samples.
    if self.n_samples:
        self.Y = tf.placeholder(tf.int32,
                                [self.batch_size + self.n_samples],
                                name='output')
    else:
        self.Y = tf.placeholder(tf.int32, [self.batch_size], name='output')
    self.Behavior = tf.placeholder(tf.int32, [self.batch_size],
                                   name='behavior')
    # One recurrent state placeholder per GRU layer.
    self.state_item = [
        tf.placeholder(tf.float32, [self.batch_size, self.item_size],
                       name='rnn_state') for _ in range(self.layers)
    ]
    self.state_beha = [
        tf.placeholder(tf.float32, [self.batch_size, self.behavior_size],
                       name='rnn_state') for _ in range(self.layers)
    ]
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.alpha = tf.get_variable('alpha',
                                 shape=[1],
                                 initializer=tf.constant_initializer(0.5))

    with tf.variable_scope('gru_layer_item'):
        # Xavier-style bound when sigma is not explicitly configured.
        sigma = self.sigma if self.sigma != 0 else np.sqrt(
            6.0 / (self.n_items + self.item_size))
        if self.init_as_normal:
            initializer = tf.random_normal_initializer(mean=0, stddev=sigma)
        else:
            initializer = tf.random_uniform_initializer(minval=-sigma,
                                                        maxval=sigma)
        embedding_item = tf.get_variable('embedding_item',
                                         [self.n_items, self.item_size],
                                         initializer=initializer)
        cell_item = rnn_cell.GRUCell(self.item_size,
                                     activation=self.hidden_act)
        drop_cell_item = rnn_cell.DropoutWrapper(
            cell_item, output_keep_prob=self.dropout_p_hidden)
        stacked_cell_item = rnn_cell.MultiRNNCell([drop_cell_item] *
                                                  self.layers)
        inputs_item = tf.nn.embedding_lookup(embedding_item, self.X)
        # Single time step: feed the externally-kept state and return it.
        output_item, state_item = stacked_cell_item(
            inputs_item, tuple(self.state_item))
        self.final_state_item = state_item

    with tf.variable_scope('gru_layer_beha'):
        embedding_behavior = tf.get_variable(
            'embedding_behavior', [self.n_behaviors, self.behavior_size],
            initializer=initializer)
        inputs_beha = tf.nn.embedding_lookup(embedding_behavior,
                                             self.Behavior)

    with tf.variable_scope('output'):
        # Fuse the item GRU output with the behavior embedding.
        output = tf.concat([output_item, inputs_beha], axis=1)
        output = tf.layers.dense(output, self.latent_size,
                                 activation='tanh')

    softmax_W = tf.get_variable('softmax_w',
                                [self.n_items, self.latent_size],
                                initializer=initializer)
    softmax_b = tf.get_variable(
        'softmax_b', [self.n_items],
        initializer=tf.constant_initializer(0.0))

    if self.is_training:
        '''
        Use other examples of the minibatch as negative samples.
        '''
        sampled_W = tf.nn.embedding_lookup(softmax_W, self.Y)
        sampled_b = tf.nn.embedding_lookup(softmax_b, self.Y)
        logits = tf.matmul(output, sampled_W,
                           transpose_b=True) + sampled_b
        self.yhat = self.final_activation(logits)
        self.cost = self.loss_function(self.yhat)
    else:
        # Inference scores over the full item catalogue.
        logits = tf.matmul(output, softmax_W,
                           transpose_b=True) + softmax_b
        self.yhat = self.final_activation(logits)

    if not self.is_training:
        return

    # Decayed learning rate with a floor of 1e-5.
    self.lr = tf.maximum(
        1e-5,
        tf.train.exponential_decay(self.learning_rate,
                                   self.global_step,
                                   self.decay_steps,
                                   self.decay,
                                   staircase=True))
    '''
    Try different optimizers.
    '''
    # optimizer = tf.train.AdagradOptimizer(self.lr)
    optimizer = tf.train.AdamOptimizer(self.lr)
    # optimizer = tf.train.AdadeltaOptimizer(self.lr)
    # optimizer = tf.train.RMSPropOptimizer(self.lr)

    tvars = tf.trainable_variables()
    gvs = optimizer.compute_gradients(self.cost, tvars)
    # Optional per-gradient norm clipping (grad_cap <= 0 disables it).
    if self.grad_cap > 0:
        capped_gvs = [(tf.clip_by_norm(grad, self.grad_cap), var)
                      for grad, var in gvs]
    else:
        capped_gvs = gvs
    self.train_op = optimizer.apply_gradients(
        capped_gvs, global_step=self.global_step)
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
  """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN and EVAL.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """
  is_tpu = params['strategy'] == 'tpu'
  if params['img_summary_steps']:
    utils.image('input_image', features, is_tpu)
  training_hooks = []
  params['is_training_bn'] = (mode == tf.estimator.ModeKeys.TRAIN)

  if params['use_keras_model']:
    # Wrap the Keras EfficientDet so it exposes the same per-level dict
    # interface as the legacy model function.
    def model_fn(inputs):
      model = efficientdet_keras.EfficientDetNet(
          config=hparams_config.Config(params))
      cls_out_list, box_out_list = model(inputs, params['is_training_bn'])
      cls_outputs, box_outputs = {}, {}
      for i in range(params['min_level'], params['max_level'] + 1):
        cls_outputs[i] = cls_out_list[i - params['min_level']]
        box_outputs[i] = box_out_list[i - params['min_level']]
      return cls_outputs, box_outputs
  else:
    model_fn = functools.partial(model, config=hparams_config.Config(params))

  precision = utils.get_precision(params['strategy'], params['mixed_precision'])
  cls_outputs, box_outputs = utils.build_model_with_precision(
      precision, model_fn, features, params['is_training_bn'])

  # Losses are computed in float32 regardless of the model precision.
  levels = cls_outputs.keys()
  for level in levels:
    cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
    box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

  # Set up training loss and learning rate.
  update_learning_rate_schedule_parameters(params)
  global_step = tf.train.get_or_create_global_step()
  learning_rate = learning_rate_schedule(params, global_step)

  # cls_loss and box_loss are for logging. only total_loss is optimized.
  det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                labels, params)
  reg_l2loss = reg_l2_loss(params['weight_decay'])
  total_loss = det_loss + reg_l2loss

  if mode == tf.estimator.ModeKeys.TRAIN:
    utils.scalar('lrn_rate', learning_rate, is_tpu)
    utils.scalar('trainloss/cls_loss', cls_loss, is_tpu)
    utils.scalar('trainloss/box_loss', box_loss, is_tpu)
    utils.scalar('trainloss/det_loss', det_loss, is_tpu)
    utils.scalar('trainloss/reg_l2_loss', reg_l2loss, is_tpu)
    utils.scalar('trainloss/loss', total_loss, is_tpu)
    train_epochs = tf.cast(global_step, tf.float32) / params['steps_per_epoch']
    utils.scalar('train_epochs', train_epochs, is_tpu)

  moving_average_decay = params['moving_average_decay']
  if moving_average_decay:
    ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                            num_updates=global_step)
    ema_vars = utils.get_ema_vars()

  if mode == tf.estimator.ModeKeys.TRAIN:
    if params['optimizer'].lower() == 'sgd':
      optimizer = tf.train.MomentumOptimizer(learning_rate,
                                             momentum=params['momentum'])
    elif params['optimizer'].lower() == 'adam':
      optimizer = tf.train.AdamOptimizer(learning_rate)
    else:
      raise ValueError('optimizers should be adam or sgd')

    if is_tpu:
      optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list = tf.trainable_variables()
    if variable_filter_fn:
      var_list = variable_filter_fn(var_list)

    if params.get('clip_gradients_norm', None):
      logging.info('clip gradients norm by %f', params['clip_gradients_norm'])
      grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
      with tf.name_scope('clip'):
        grads = [gv[0] for gv in grads_and_vars]
        tvars = [gv[1] for gv in grads_and_vars]
        # First clip each variable's norm, then clip global norm.
        clip_norm = abs(params['clip_gradients_norm'])
        clipped_grads = [
            tf.clip_by_norm(g, clip_norm) if g is not None else None
            for g in grads
        ]
        clipped_grads, _ = tf.clip_by_global_norm(clipped_grads, clip_norm)
        utils.scalar('gradient_norm', tf.linalg.global_norm(clipped_grads),
                     is_tpu)
        grads_and_vars = list(zip(clipped_grads, tvars))

      with tf.control_dependencies(update_ops):
        train_op = optimizer.apply_gradients(grads_and_vars, global_step)
    else:
      with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(total_loss, global_step,
                                      var_list=var_list)

    if moving_average_decay:
      # EMA update runs after the parameter update it shadows.
      with tf.control_dependencies([train_op]):
        train_op = ema.apply(ema_vars)
  else:
    train_op = None

  eval_metrics = None
  if mode == tf.estimator.ModeKeys.EVAL:

    def metric_fn(**kwargs):
      """Returns a dictionary that has the evaluation metrics."""
      if params['nms_configs'].get('pyfunc', True):
        # Per-image NMS through a numpy py-function.
        detections_bs = []
        nms_configs = params['nms_configs']
        for index in range(kwargs['boxes'].shape[0]):
          detections = tf.numpy_function(
              functools.partial(nms_np.per_class_nms,
                                nms_configs=nms_configs),
              [
                  kwargs['boxes'][index],
                  kwargs['scores'][index],
                  kwargs['classes'][index],
                  tf.slice(kwargs['image_ids'], [index], [1]),
                  tf.slice(kwargs['image_scales'], [index], [1]),
                  params['num_classes'],
                  nms_configs['max_output_size'],
              ], tf.float32)
          detections_bs.append(detections)
        detections_bs = postprocess.transform_detections(
            tf.stack(detections_bs))
      else:
        # These two branches should be equivalent, but currently they are not.
        # TODO(tanmingxing): enable the non_pyfun path after bug fix.
        nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
            params, kwargs['boxes'], kwargs['scores'], kwargs['classes'],
            kwargs['image_scales'])
        img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                          nms_scores.dtype)
        # COCO-style rows: [id, x, y, w, h, score, class].
        detections_bs = [
            img_ids * tf.ones_like(nms_scores),
            nms_boxes[:, :, 1],
            nms_boxes[:, :, 0],
            nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
            nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
            nms_scores,
            nms_classes,
        ]
        detections_bs = tf.stack(detections_bs, axis=-1, name='detnections')

      if params.get('testdev_dir', None):
        logging.info('Eval testdev_dir %s', params['testdev_dir'])
        eval_metric = coco_metric.EvaluationMetric(
            testdev_dir=params['testdev_dir'])
        coco_metrics = eval_metric.estimator_metric_fn(detections_bs,
                                                       tf.zeros([1]))
      else:
        logging.info('Eval val with groudtruths %s.', params['val_json_file'])
        eval_metric = coco_metric.EvaluationMetric(
            filename=params['val_json_file'], label_map=params['label_map'])
        coco_metrics = eval_metric.estimator_metric_fn(
            detections_bs, kwargs['groundtruth_data'])

      # Add metrics to output.
      cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
      box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
      output_metrics = {
          'cls_loss': cls_loss,
          'box_loss': box_loss,
      }
      output_metrics.update(coco_metrics)
      return output_metrics

    # metric_fn inputs must be batched tensors; tile the scalar losses.
    cls_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(cls_loss, 0), [
            params['batch_size'],
        ]), [params['batch_size'], 1])
    box_loss_repeat = tf.reshape(
        tf.tile(tf.expand_dims(box_loss, 0), [
            params['batch_size'],
        ]), [params['batch_size'], 1])

    cls_outputs = postprocess.to_list(cls_outputs)
    box_outputs = postprocess.to_list(box_outputs)
    params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS
    boxes, scores, classes = postprocess.pre_nms(params, cls_outputs,
                                                 box_outputs)
    metric_fn_inputs = {
        'cls_loss_repeat': cls_loss_repeat,
        'box_loss_repeat': box_loss_repeat,
        'image_ids': labels['source_ids'],
        'groundtruth_data': labels['groundtruth_data'],
        'image_scales': labels['image_scales'],
        'boxes': boxes,
        'scores': scores,
        'classes': classes,
    }
    eval_metrics = (metric_fn, metric_fn_inputs)

  checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

  if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
    # Initialize the model from an EfficientDet or backbone checkpoint.
    if params.get('ckpt') and params.get('backbone_ckpt'):
      raise RuntimeError(
          '--backbone_ckpt and --checkpoint are mutually exclusive')

    if params.get('backbone_ckpt'):
      var_scope = params['backbone_name'] + '/'
      if params['ckpt_var_scope'] is None:
        # Use backbone name as default checkpoint scope.
        ckpt_scope = params['backbone_name'] + '/'
      else:
        ckpt_scope = params['ckpt_var_scope'] + '/'
    else:
      # Load every var in the given checkpoint
      var_scope = ckpt_scope = '/'

    def scaffold_fn():
      """Loads pretrained model through scaffold function."""
      logging.info('restore variables from %s', checkpoint)

      var_map = utils.get_ckpt_var_map(
          ckpt_path=checkpoint,
          ckpt_scope=ckpt_scope,
          var_scope=var_scope,
          skip_mismatch=params['skip_mismatch'])

      tf.train.init_from_checkpoint(checkpoint, var_map)
      return tf.train.Scaffold()
  elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

    def scaffold_fn():
      """Load moving average variables for eval."""
      logging.info('Load EMA vars with ema_decay=%f', moving_average_decay)
      restore_vars_dict = ema.variables_to_restore(ema_vars)
      saver = tf.train.Saver(restore_vars_dict)
      return tf.train.Scaffold(saver=saver)
  else:
    scaffold_fn = None

  if is_tpu:
    return tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        eval_metrics=eval_metrics,
        host_call=utils.get_tpu_host_call(global_step, params),
        scaffold_fn=scaffold_fn,
        training_hooks=training_hooks)
  else:
    # Profile every 1K steps.
    if params.get('profile', False):
      profile_hook = tf.estimator.ProfilerHook(
          save_steps=1000, output_dir=params['model_dir'], show_memory=True)
      training_hooks.append(profile_hook)

      # Report memory allocation if OOM; it will slow down the running.
      class OomReportingHook(tf.estimator.SessionRunHook):

        def before_run(self, run_context):
          return tf.estimator.SessionRunArgs(
              fetches=[],
              options=tf.RunOptions(report_tensor_allocations_upon_oom=True))

      training_hooks.append(OomReportingHook())

    logging_hook = tf.estimator.LoggingTensorHook(
        {
            'step': global_step,
            'det_loss': det_loss,
            'cls_loss': cls_loss,
            'box_loss': box_loss,
        },
        every_n_iter=params.get('iterations_per_loop', 100),
    )
    training_hooks.append(logging_hook)

    eval_metric_ops = (
        eval_metrics[0](**eval_metrics[1]) if eval_metrics else None)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=total_loss,
        train_op=train_op,
        eval_metric_ops=eval_metric_ops,
        scaffold=scaffold_fn() if scaffold_fn else None,
        training_hooks=training_hooks)
def build_model(self):
    """Build the session-based GRU recommendation graph.

    Creates placeholders (`X` input item ids, `Y` target item ids incl.
    negative samples, per-layer `state`, optional `U` user ids), the stacked
    GRU with dropout, the (sampled) softmax output, and — when training —
    the decayed-learning-rate optimizer and `train_op`.

    NOTE(review): original indentation was lost; scope nesting below is the
    conventional layout for this code — confirm against upstream history.
    """
    if self.user_data:
        # Per-example user ids, only used when user features are enabled.
        self.U = tf.placeholder(tf.int32, [self.batch_size], name='user_id')
    self.X = tf.placeholder(tf.int32, [self.batch_size], name='input')
    # Y holds the positive targets plus self.n_samples extra negatives.
    self.Y = tf.placeholder(tf.int32, [self.batch_size + self.n_samples],
                            name='output')
    # One recurrent-state placeholder per GRU layer (state is fed externally
    # between minibatches instead of using dynamic_rnn).
    self.state = [
        tf.placeholder(tf.float32, [self.batch_size, self.rnn_size],
                       name='rnn_state') for _ in range(self.layers)
    ]
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    with tf.variable_scope('gru_layer'):
        # parameter initialization: Xavier-style bound unless sigma is given.
        sigma = self.sigma if self.sigma != 0 else np.sqrt(
            6.0 / (self.n_items + self.rnn_size))
        if self.init_as_normal:
            initializer = tf.random_normal_initializer(mean=0, stddev=sigma)
        else:
            initializer = tf.random_uniform_initializer(minval=-sigma,
                                                        maxval=sigma)
        embedding = tf.get_variable('embedding',
                                    [self.n_items, self.rnn_size],
                                    initializer=initializer)
        # Output softmax weights are stored item-major and transposed at use.
        softmax_W = tf.get_variable('softmax_w',
                                    [self.n_items, self.rnn_size],
                                    initializer=initializer)
        softmax_b = tf.get_variable(
            'softmax_b', [self.n_items],
            initializer=tf.constant_initializer(0.0))
        if self.user_data:
            user_embedding = tf.get_variable('user_embedding',
                                             [self.n_users, self.rnn_size],
                                             initializer=initializer)
        cells = []
        for i in range(self.layers):
            cell = rnn_cell.GRUCell(
                num_units=self.rnn_size,
                activation=self.hidden_act)  # GRU cell in the hidden layer
            cell = rnn_cell.DropoutWrapper(
                cell, output_keep_prob=self.dropout_p_hidden)
            cells.append(cell)
        multi_cell = rnn_cell.MultiRNNCell(
            cells)  # multiple GRU cells in hidden layers with dropout
        # input and output: single-step call, state threaded by the caller.
        inputs = tf.nn.embedding_lookup(embedding, self.X)  # self.X is input
        output, state = multi_cell(inputs, tuple(self.state))
        if self.user_data:
            # Fuse the user embedding into the sequence representation.
            user_output = tf.nn.embedding_lookup(user_embedding, self.U)
            user_concatenated = tf.concat([output, user_output], axis=1)
            output = tf.layers.dense(user_concatenated,
                                     self.rnn_size,
                                     activation='tanh')
        self.final_state = state
    if self.is_training:
        ''' Use other 
        examples of the minibatch as negative samples. '''
        # Sampled softmax: score only the ids listed in Y (positives +
        # shared negatives) instead of all n_items.
        sampled_W = tf.nn.embedding_lookup(softmax_W, self.Y)
        sampled_b = tf.nn.embedding_lookup(softmax_b, self.Y)
        logits = tf.matmul(
            output, sampled_W, transpose_b=True
        ) + sampled_b  # sampled_W is transposed before multiplication
        self.yhat = self.final_activation(logits)
        self.cost = self.loss_function(self.yhat)
    else:  # if not training: score the full item catalog.
        logits = tf.matmul(output, softmax_W,
                           transpose_b=True) + softmax_b
        self.yhat = self.final_activation(logits)
    if not self.is_training:
        return
    # Exponentially decayed learning rate, floored at 1e-5.
    self.lr = tf.maximum(
        1e-5,
        tf.train.exponential_decay(self.learning_rate,
                                   self.global_step,
                                   self.decay_steps,
                                   self.decay,
                                   staircase=True))
    # set optimizer. NOTE: self.optimizer starts as a string and is rebound
    # to the optimizer object here; no fallback for unknown names.
    if self.optimizer == 'adagrad':
        self.optimizer = tf.train.AdagradOptimizer(self.lr)
    elif self.optimizer == 'adam':
        self.optimizer = tf.train.AdamOptimizer(self.lr)
    elif self.optimizer == 'adadelta':
        self.optimizer = tf.train.AdadeltaOptimizer(self.lr)
    elif self.optimizer == 'rmsprop':
        self.optimizer = tf.train.RMSPropOptimizer(self.lr)
    tvars = tf.trainable_variables()
    gvs = self.optimizer.compute_gradients(self.cost, tvars)
    if self.grad_cap > 0:
        # Per-variable gradient norm clipping.
        capped_gvs = [(tf.clip_by_norm(grad, self.grad_cap), var)
                      for grad, var in gvs]
    else:
        capped_gvs = gvs
    self.train_op = self.optimizer.apply_gradients(
        capped_gvs, global_step=self.global_step)
def __init__(self,
             linear_size,
             num_layers,
             residual,
             batch_norm,
             max_norm,
             batch_size,
             learning_rate,
             summaries_dir,
             predict_14=False,
             dtype=tf.float32):
    """Creates the linear + relu model

    Args
      linear_size: integer. number of units in each layer of the model
      num_layers: integer. number of bilinear blocks in the model
      residual: boolean. Whether to add residual connections
      batch_norm: boolean. Whether to use batch normalization
      max_norm: boolean. Whether to clip weights to a norm of 1
      batch_size: integer. The size of the batches used during training
      learning_rate: float. Learning rate to start with
      summaries_dir: String. Directory where to log progress
      predict_14: boolean. Whether to predict 14 instead of 17 joints
      dtype: the data type to use to store internal variables
    """

    # There are in total 17 joints in H3.6M and 16 in MPII (and therefore in
    # stacked hourglass detections). We settled with 16 joints in 2d just to
    # make models compatible (e.g. you can train on ground truth 2d and test
    # on SH detections). This does not seem to have an effect on prediction
    # performance.
    self.HUMAN_2D_SIZE = 16 * 2

    # In 3d all the predictions are zero-centered around the root (hip)
    # joint, so we actually predict only 16 joints. The error is still
    # computed over 17 joints, because if one uses, e.g. Procrustes
    # alignment, there is still error in the hip to account for!
    # There is also an option to predict only 14 joints, which makes our
    # results directly comparable to those in
    # https://arxiv.org/pdf/1611.09010.pdf
    self.HUMAN_3D_SIZE = 14 * 3 if predict_14 else 16 * 3

    self.input_size = self.HUMAN_2D_SIZE
    self.output_size = self.HUMAN_3D_SIZE

    # Fed at run time: training flag (drives batch norm) and dropout keep
    # probability.
    self.isTraining = tf.placeholder(tf.bool, name="isTrainingflag")
    self.dropout_keep_prob = tf.placeholder(tf.float32,
                                            name="dropout_keep_prob")

    # Summary writers for train and test runs
    self.train_writer = tf.summary.FileWriter(
        os.path.join(summaries_dir, 'train'))
    self.test_writer = tf.summary.FileWriter(
        os.path.join(summaries_dir, 'test'))

    self.linear_size = linear_size
    self.batch_size = batch_size
    # Rebound below to the decayed-schedule tensor; the Variable seeds it.
    self.learning_rate = tf.Variable(float(learning_rate),
                                     trainable=False,
                                     dtype=dtype,
                                     name="learning_rate")
    self.global_step = tf.Variable(0, trainable=False, name="global_step")
    decay_steps = 100000  # empirical
    decay_rate = 0.96  # empirical
    self.learning_rate = tf.train.exponential_decay(self.learning_rate,
                                                    self.global_step,
                                                    decay_steps, decay_rate)

    # === Transform the inputs ===
    with vs.variable_scope("inputs"):
        # in=2d poses, out=3d poses
        enc_in = tf.placeholder(dtype,
                                shape=[None, self.input_size],
                                name="enc_in")
        dec_out = tf.placeholder(dtype,
                                 shape=[None, self.output_size],
                                 name="dec_out")
        self.encoder_inputs = enc_in
        self.decoder_outputs = dec_out

    # === Create the linear + relu combos ===
    with vs.variable_scope("linear_model"):
        # === First layer, brings dimensionality up to linear_size ===
        # `kaiming` initializer is defined elsewhere in this file.
        w1 = tf.get_variable(name="w1",
                             initializer=kaiming,
                             shape=[self.HUMAN_2D_SIZE, linear_size],
                             dtype=dtype)
        b1 = tf.get_variable(name="b1",
                             initializer=kaiming,
                             shape=[linear_size],
                             dtype=dtype)
        w1 = tf.clip_by_norm(w1, 1) if max_norm else w1
        y3 = tf.matmul(enc_in, w1) + b1

        if batch_norm:
            y3 = tf.layers.batch_normalization(y3,
                                               training=self.isTraining,
                                               name="batch_normalization")
        y3 = tf.nn.relu(y3)
        y3 = tf.nn.dropout(y3, self.dropout_keep_prob)

        # === Create multiple bi-linear layers ===
        for idx in range(num_layers):
            y3 = self.two_linear(y3, linear_size, residual,
                                 self.dropout_keep_prob, max_norm,
                                 batch_norm, dtype, idx)

        # === Last linear layer has HUMAN_3D_SIZE in output ===
        w4 = tf.get_variable(name="w4",
                             initializer=kaiming,
                             shape=[linear_size, self.HUMAN_3D_SIZE],
                             dtype=dtype)
        b4 = tf.get_variable(name="b4",
                             initializer=kaiming,
                             shape=[self.HUMAN_3D_SIZE],
                             dtype=dtype)
        w4 = tf.clip_by_norm(w4, 1) if max_norm else w4
        y = tf.matmul(y3, w4) + b4
        # === End linear model ===

    # Store the outputs here
    self.outputs = y
    self.loss = tf.reduce_mean(tf.square(y - dec_out))
    self.loss_summary = tf.summary.scalar('loss/loss', self.loss)

    # To keep track of the loss in mm
    self.err_mm = tf.placeholder(tf.float32, name="error_mm")
    self.err_mm_summary = tf.summary.scalar("loss/error_mm", self.err_mm)

    # Gradients and update operation for training the model.
    opt = tf.train.AdamOptimizer(self.learning_rate)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    with tf.control_dependencies(update_ops):
        # Update all the trainable parameters
        gradients = opt.compute_gradients(self.loss)
        # NOTE(review): elements of `gradients` are (grad, var) tuples and
        # are never None, so this substitution appears to be dead code kept
        # from upstream; also uses `== None` rather than `is None`.
        self.gradients = [[] if i == None else i for i in gradients]
        self.updates = opt.apply_gradients(gradients,
                                           global_step=self.global_step)

    # Keep track of the learning rate
    self.learning_rate_summary = tf.summary.scalar(
        'learning_rate/learning_rate', self.learning_rate)

    # To save the model
    self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)
def run_epoch(self, fileName):
    """Build the DKT train/test graphs and run the full training loop.

    Reads sequences from `fileName`, trains for the configured number of
    epochs, periodically evaluates on the held-out sequences, and
    checkpoints to `model/my-model`.
    """
    # Instantiate the configuration object.
    config = Config()
    # Instantiate the data generator.
    dataGen = DataGenerator(fileName, config)
    dataGen.gen_attr()  # build the train and test splits

    # Both lists have the form:
    #   [ [[skill_id, correct], [skill_id, correct], ...], [...], ... ]
    # e.g. train_seqs has 3384 entries; each entry is one student's exercise
    # sequence of [skill_id, correct] pairs.
    train_seqs = dataGen.train_seqs  # length: 3384
    test_seqs = dataGen.test_seqs  # length: 843

    session_conf = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False
    )
    sess = tf.Session(config=session_conf)
    self.sess = sess
    with sess.as_default():
        # Instantiate the DKT model twice over shared variables: the test
        # graph reuses the train graph's weights (reuse=True).
        with tf.name_scope("train"):
            with tf.variable_scope("dkt", reuse=None):
                # train_dkt: a TensorFlowDKT model
                train_dkt = TensorFlowDKT(config)
        with tf.name_scope("test"):
            with tf.variable_scope("dkt", reuse=True):
                test_dkt = TensorFlowDKT(config)
        self.train_dkt = train_dkt  # a TensorFlowDKT model
        self.test_dkt = test_dkt  # a TensorFlowDKT model

        global_step = tf.Variable(0, name="global_step", trainable=False)
        self.global_step = global_step  # <tf.Variable 'global_step:0' shape=( ) dtype=int32_ref>

        # Define the optimizer.
        optimizer = tf.train.AdamOptimizer(config.trainConfig.learning_rate)
        grads_and_vars = optimizer.compute_gradients(train_dkt.loss)  # loss comes from train_dkt
        # Clip each gradient's norm; drop variables with no gradient.
        grads_and_vars = [(tf.clip_by_norm(g, config.trainConfig.max_grad_norm), v)
                          for g, v in grads_and_vars if g is not None]
        # Final training node of the graph.
        train_op = optimizer.apply_gradients(grads_and_vars,
                                             global_step=global_step,
                                             name="train_op")

        # Record gradient histograms/sparsity summaries for TensorBoard.
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name),
                                                     tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("writing to {}".format(out_dir))

        # Training summaries.
        train_loss_summary = tf.summary.scalar("loss", train_dkt.loss)
        train_summary_op = tf.summary.merge([train_loss_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Evaluation summaries.
        test_loss_summary = tf.summary.scalar("loss", test_dkt.loss)
        dev_summary_op = tf.summary.merge([test_loss_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        saver = tf.train.Saver(tf.global_variables())
        sess.run(tf.global_variables_initializer())
        print("初始化完毕,开始训练")
        for i in range(config.trainConfig.epochs):
            np.random.shuffle(train_seqs)
            for params in dataGen.next_batch(train_seqs):
                # Train on one batch. `params` is a dict, e.g.:
                # 1) input_x --> shape (32, 1109, 248): 32 students, sequence
                #    length 1109, each step a one-hot over the skills
                # 2) target_id --> shape (32, 1109): exercise indices,
                #    zero-padded: [idx1, idx2, ..., 0, 0, 0]
                # 3) target_correctness --> whether each target_id was correct
                # 4) seq_len: shape (32): per-student sequence lengths
                # 5) max_len: shape (): max of seq_len, e.g. 1109
                self.train_step(params, train_op, train_summary_op, train_summary_writer)

                current_step = tf.train.global_step(sess, global_step)
                # Periodically evaluate and record results.
                if current_step % config.trainConfig.evaluate_every == 0:
                    print("\nEvaluation:")
                    # Accumulate metrics over all evaluation batches.
                    losses = []
                    accuracys = []
                    aucs = []
                    precisions = []
                    recalls = []
                    for params in dataGen.next_batch(test_seqs):
                        loss, accuracy, auc, precision, recall = self.dev_step(params, dev_summary_op, writer=None)
                        losses.append(loss)
                        accuracys.append(accuracy)
                        aucs.append(auc)
                        precisions.append(precision)
                        recalls.append(recall)
                    time_str = datetime.datetime.now().isoformat()
                    print("dev: {}, step: {}, loss: {}, acc: {}, auc: {}, precision: {}, recall: {}".
                          format(time_str, current_step, mean(losses), mean(accuracys),
                                 mean(aucs), mean(precisions), mean(recalls)))

                if current_step % config.trainConfig.checkpoint_every == 0:
                    path = saver.save(sess, "model/my-model", global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
# Script-level training setup: REINFORCE-style loss with a baseline plus
# entropy and L2 regularizers. `final_loss_per_sample`, `baseline`,
# `log_seq_prob`, `avg_sample_loss`, `nmn3_model`, `baseline_update_op`,
# `lambda_entropy`, `weight_decay`, `max_grad_l2_norm` and `log_dir` are all
# defined earlier in this script.
# stop_gradient keeps the advantage (loss - baseline) out of backprop; only
# log_seq_prob carries gradient, which is the policy-gradient estimator.
policy_gradient_loss = tf.reduce_mean(
    tf.stop_gradient(final_loss_per_sample - baseline) * log_seq_prob)
total_training_loss = policy_gradient_loss + avg_sample_loss
total_loss = tf.add_n([
    total_training_loss, lambda_entropy * nmn3_model.entropy_reg,
    weight_decay * nmn3_model.l2_reg
])

# Train with Adam
solver = tf.train.AdamOptimizer()
gradients = solver.compute_gradients(total_loss)
# Clip gradient by L2 norm
# gradients = gradients_part1+gradients_part2
# NOTE(review): assumes no gradient in the list is None — tf.clip_by_norm
# would raise otherwise; confirm every variable receives a gradient.
gradients = [(tf.clip_by_norm(g, max_grad_l2_norm), v)
             for g, v in gradients]
solver_op = solver.apply_gradients(gradients)

# Training operation
# Partial-run can't fetch training operations
# some workaround to make partial-run work:
# a dummy constant whose evaluation forces the train + baseline updates.
with tf.control_dependencies([solver_op, baseline_update_op]):
    train_step = tf.constant(0)

# Write summary to TensorBoard
os.makedirs(log_dir, exist_ok=True)
log_writer = tf.summary.FileWriter(log_dir, tf.get_default_graph())
# Scalar placeholders fed from Python-side metrics when writing summaries.
loss_ph = tf.placeholder(tf.float32, [])
entropy_ph = tf.placeholder(tf.float32, [])
accuracy_ph = tf.placeholder(tf.float32, [])
baseline_ph = tf.placeholder(tf.float32, [])
def get_train_ops(loss,
                  tf_variables,
                  train_step,
                  clip_mode=None,
                  grad_bound=None,
                  l2_reg=1e-4,
                  lr_warmup_val=None,
                  lr_warmup_steps=100,
                  lr_init=0.1,
                  lr_dec_start=0,
                  lr_dec_every=10000,
                  lr_dec_rate=0.1,
                  lr_dec_min=None,
                  lr_cosine=False,
                  lr_max=None,
                  lr_min=None,
                  lr_T_0=None,
                  lr_T_mul=None,
                  num_train_batches=None,
                  optim_algo=None,
                  sync_replicas=False,
                  num_aggregate=None,
                  num_replicas=None,
                  get_grad_norms=False,
                  moving_average=None):
    """Builds the training op with L2 reg, gradient clipping and an LR schedule.

    Args:
      loss: scalar loss tensor; l2_reg * sum(var**2) is added when l2_reg > 0.
      tf_variables: variables to differentiate with respect to.
      train_step: integer global-step tensor/variable.
      clip_mode: "global" (clip_by_global_norm), "norm" (per-gradient
        clip_by_norm), or None.
      grad_bound: clipping bound; required when clip_mode is set.
      lr_cosine: if True, use SGDR-style cosine restarts (requires lr_max,
        lr_min, lr_T_0, lr_T_mul, num_train_batches); otherwise exponential
        decay from lr_init.
      moving_average: if set, store the moving average of parameters with
        this decay.

    Returns:
      (train_op, learning_rate, grad_norm, opt) and additionally grad_norms
      (dict of per-variable norms) when get_grad_norms is True.

    Raises:
      NotImplementedError: on an unknown clip_mode.
      ValueError: on an unknown optim_algo.
    """
    if l2_reg > 0:
        l2_losses = []
        for var in tf_variables:
            l2_losses.append(tf.reduce_sum(var**2))
        l2_loss = tf.add_n(l2_losses)
        loss += l2_reg * l2_loss  # loss = loss + 1e-4*l2_loss

    grads = tf.gradients(loss, tf_variables)
    grad_norm = tf.global_norm(grads)

    # Per-variable gradient norms for monitoring (before clipping).
    grad_norms = {}
    for v, g in zip(tf_variables, grads):
        if v is None or g is None:
            continue
        if isinstance(g, tf.IndexedSlices):
            grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g.values**2))
        else:
            grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g**2))

    if clip_mode is not None:
        assert grad_bound is not None, "Need grad_bound to clip gradients."
        if clip_mode == "global":
            grads, _ = tf.clip_by_global_norm(grads, grad_bound)
        elif clip_mode == "norm":
            clipped = []
            for g in grads:
                if isinstance(g, tf.IndexedSlices):
                    c_g = tf.clip_by_norm(g.values, grad_bound)
                    # BUGFIX: IndexedSlices takes (values, indices); the
                    # arguments were previously swapped.
                    c_g = tf.IndexedSlices(c_g, g.indices)
                else:
                    c_g = tf.clip_by_norm(g, grad_bound)
                # BUGFIX: append the clipped gradient; the original appended
                # the unclipped `g`, making "norm" clipping a no-op.
                clipped.append(c_g)
            grads = clipped
        else:
            raise NotImplementedError(
                "Unknown clip_mode {}".format(clip_mode))

    if lr_cosine:
        assert lr_max is not None, "Need lr_max to use lr_cosine"
        assert lr_min is not None, "Need lr_min to use lr_cosine"
        assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine"
        assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine"
        assert num_train_batches is not None, ("Need num_train_batches to use"
                                               " lr_cosine")

        # train step will be calculated by just one batch!
        curr_epoch = train_step // num_train_batches

        # SGDR bookkeeping: epoch of the last restart and current period T_i.
        last_reset = tf.Variable(0,
                                 dtype=tf.int32,
                                 trainable=False,
                                 name="last_reset")
        T_i = tf.Variable(lr_T_0, dtype=tf.int32, trainable=False, name="T_i")
        T_curr = curr_epoch - last_reset

        def _update():
            # Restart: record the epoch and grow the period by lr_T_mul.
            update_last_reset = tf.assign(last_reset,
                                          curr_epoch,
                                          use_locking=True)
            update_T_i = tf.assign(T_i, T_i * lr_T_mul, use_locking=True)
            with tf.control_dependencies([update_last_reset, update_T_i]):
                rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926
                lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate))
            return lr

        def _no_update():
            rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926
            lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate))
            return lr

        learning_rate = tf.cond(tf.greater_equal(T_curr, T_i), _update,
                                _no_update)
    else:
        learning_rate = tf.train.exponential_decay(
            lr_init,
            tf.maximum(train_step - lr_dec_start, 0),
            lr_dec_every,
            lr_dec_rate,
            staircase=True)
        if lr_dec_min is not None:
            learning_rate = tf.maximum(learning_rate, lr_dec_min)

    if lr_warmup_val is not None:
        # Constant warmup LR for the first lr_warmup_steps steps.
        learning_rate = tf.cond(tf.less(train_step, lr_warmup_steps),
                                lambda: lr_warmup_val,
                                lambda: learning_rate)

    if optim_algo == "momentum":
        opt = tf.train.MomentumOptimizer(learning_rate,
                                         0.9,
                                         use_locking=True,
                                         use_nesterov=True)
    elif optim_algo == "sgd":
        opt = tf.train.GradientDescentOptimizer(learning_rate,
                                                use_locking=True)
    elif optim_algo == "adam":
        opt = tf.train.AdamOptimizer(learning_rate,
                                     beta1=0.0,
                                     epsilon=1e-3,
                                     use_locking=True)
    else:
        raise ValueError("Unknown optim_algo {}".format(optim_algo))

    if sync_replicas:
        assert num_aggregate is not None, "Need num_aggregate to sync."
        assert num_replicas is not None, "Need num_replicas to sync."
        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=num_aggregate,
            total_num_replicas=num_replicas,
            use_locking=True)

    if moving_average is not None:
        opt = tf.contrib.opt.MovingAverageOptimizer(
            opt, average_decay=moving_average)

    train_op = opt.apply_gradients(zip(grads, tf_variables),
                                   global_step=train_step)

    if get_grad_norms:
        return train_op, learning_rate, grad_norm, opt, grad_norms
    else:
        return train_op, learning_rate, grad_norm, opt
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

    Args:
      features: A dict of `Tensor` of batched images and other features.
      labels: a Tensor or a dict of Tensor representing the batched labels.
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size.

    Returns:
      A `TPUEstimatorSpec` for the model
    """
    logging.info('params=%s', params)
    # Unwrap dict-valued inputs into plain tensors.
    images = features['image'] if isinstance(features, dict) else features
    labels = labels['label'] if isinstance(labels, dict) else labels
    config = params['config']
    image_size = params['image_size']
    utils.scalar('model/resolution', image_size)

    # NHWC -> NCHW when the model is configured channels-first.
    if config.model.data_format == 'channels_first':
        images = tf.transpose(images, [0, 3, 1, 2])

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (config.train.ema_decay > 0)

    if FLAGS.use_tpu and not config.model.bn_type:
        config.model.bn_type = 'tpu_bn'

    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)

    def build_model(in_images):
        """Build model using the model_name given through the command line."""
        config.model.num_classes = config.data.num_classes
        model = effnetv2_model.EffNetV2Model(config.model.model_name,
                                             config.model)
        logits = model(in_images, training=is_training)[0]
        return logits

    # Snapshot params/flops before building so the delta below counts the
    # backbone alone.
    pre_num_params, pre_num_flops = utils.num_params_flops(
        readable_format=True)

    if config.runtime.mixed_precision:
        precision = 'mixed_bfloat16' if FLAGS.use_tpu else 'mixed_float16'
        logits = utils.build_model_with_precision(precision, build_model,
                                                  images, is_training)
        logits = tf.cast(logits, tf.float32)
    else:
        logits = build_model(images)

    num_params, num_flops = utils.num_params_flops(readable_format=True)
    num_params = num_params - pre_num_params
    num_flops = (num_flops - pre_num_flops) / params['batch_size']
    logging.info('backbone params/flops = %.4f M / %.4f B', num_params,
                 num_flops)
    utils.scalar('model/params', num_params)
    utils.scalar('model/flops', num_flops)

    # Calculate loss, which includes softmax cross entropy and L2
    # regularization.
    if config.train.loss_type == 'sigmoid':
        cross_entropy = tf.losses.sigmoid_cross_entropy(
            multi_class_labels=tf.cast(labels, dtype=logits.dtype),
            logits=logits,
            label_smoothing=config.train.label_smoothing)
    elif config.train.loss_type == 'custom':
        xent = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(
            labels, dtype=logits.dtype), logits=logits)
        cross_entropy = tf.reduce_mean(tf.reduce_sum(xent, axis=-1))
    else:
        if config.data.multiclass:
            logging.info('use multi-class loss: %s', config.data.multiclass)
            # Normalize multi-hot labels into a distribution for softmax CE.
            labels /= tf.reshape(tf.reduce_sum(labels, axis=1), (-1, 1))
        cross_entropy = tf.losses.softmax_cross_entropy(
            onehot_labels=labels,
            logits=logits,
            label_smoothing=config.train.label_smoothing)

    train_steps = max(config.train.min_steps,
                      config.train.epochs * params['steps_per_epoch'])
    global_step = tf.train.get_global_step()
    # Weight decay ramps linearly with training progress.
    weight_decay_inc = config.train.weight_decay_inc * (
        tf.cast(global_step, tf.float32) / tf.cast(train_steps, tf.float32))
    weight_decay = (1 + weight_decay_inc) * config.train.weight_decay
    utils.scalar('train/weight_decay', weight_decay)
    # Add weight decay to the loss for non-batch-normalization variables.
    matcher = re.compile(config.train.weight_decay_exclude)
    l2loss = weight_decay * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables()
        if not matcher.match(v.name)
    ])
    loss = cross_entropy + l2loss
    utils.scalar('loss/l2reg', l2loss)
    utils.scalar('loss/xent', cross_entropy)

    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=config.train.ema_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    restore_vars_dict = None
    if is_training:
        # Compute the current epoch and associated learning rate from
        # global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])
        utils.scalar('train/epoch', current_epoch)

        # Linear-scaling rule: base LR scales with batch size relative to 256.
        scaled_lr = config.train.lr_base * (config.train.batch_size / 256.0)
        scaled_lr_min = config.train.lr_min * (config.train.batch_size / 256.0)
        learning_rate = utils.WarmupLearningRateSchedule(
            scaled_lr,
            steps_per_epoch=params['steps_per_epoch'],
            decay_epochs=config.train.lr_decay_epoch,
            warmup_epochs=config.train.lr_warmup_epoch,
            decay_factor=config.train.lr_decay_factor,
            lr_decay_type=config.train.lr_sched,
            total_steps=train_steps,
            minimal_lr=scaled_lr_min)(global_step)
        utils.scalar('train/lr', learning_rate)
        optimizer = utils.build_optimizer(
            learning_rate, optimizer_name=config.train.optimizer)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer
            # which handles synchronization details between different TPU
            # cores. To the user, this should look like regular synchronous
            # training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # filter trainable variables if needed.
        var_list = tf.trainable_variables()
        if config.train.varsexp:
            vars2 = [
                v for v in var_list if re.match(config.train.varsexp, v.name)
            ]
            if len(vars2) == len(var_list):
                # NOTE(review): fires when the regex excluded nothing, yet
                # logs config.train.freeze instead of varsexp — looks like a
                # copy/paste slip; confirm intent upstream.
                logging.warning('%s has no match.', config.train.freeze)
            logging.info('Filter variables: orig=%d, final=%d, delta=%d',
                         len(var_list), len(vars2),
                         len(var_list) - len(vars2))
            var_list = vars2

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if config.train.gclip and is_training:
            logging.info('clip gradients norm by %f', config.train.gclip)
            grads_and_vars = optimizer.compute_gradients(loss, var_list)
            with tf.name_scope('gclip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                utils.scalar('train/gnorm', tf.linalg.global_norm(grads))
                utils.scalar('train/gnormmax',
                             tf.math.reduce_max([tf.norm(g) for g in grads]))
                # First clip each variable's norm, then clip global norm.
                clip_norm = abs(config.train.gclip)
                clipped_grads = [
                    tf.clip_by_norm(g, clip_norm) if g is not None else None
                    for g in grads
                ]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(loss,
                                              global_step,
                                              var_list=var_list)

        if has_moving_average_decay:
            # Run EMA update after every weight update.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not config.runtime.skip_host_call:
            host_call = utils.get_tpu_host_call(
                global_step, FLAGS.model_dir,
                config.runtime.iterations_per_loop)
    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly
            reference any Tensors in the rest of the `model_fn`. To pass
            Tensors from the model to the `metric_fn`, provide as part of the
            `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
              labels: `Tensor` with shape `[batch, num_classes]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            """
            metrics = {}
            if config.data.multiclass:
                metrics['eval/global_ap'] = tf.metrics.auc(
                    labels,
                    tf.nn.sigmoid(logits),
                    curve='PR',
                    num_thresholds=200,
                    summation_method='careful_interpolation',
                    name='global_ap')

                # Convert labels to set: be careful, tf.metrics.xx_at_k are
                # horrible.
                labels = tf.cast(labels, dtype=tf.int64)
                label_to_repeat = tf.expand_dims(tf.argmax(labels, axis=-1),
                                                 axis=-1)
                all_labels_set = tf.range(0, labels.shape[-1],
                                          dtype=tf.int64)
                all_labels_set = tf.expand_dims(all_labels_set, axis=0)
                # Absent classes are replaced by the argmax label so the
                # "set" has a fixed width per example.
                labels_set = labels * all_labels_set + (
                    1 - labels) * label_to_repeat

                metrics['eval/precision@1'] = tf.metrics.precision_at_k(
                    labels_set, logits, k=1)
                metrics['eval/recall@1'] = tf.metrics.recall_at_k(labels_set,
                                                                  logits,
                                                                  k=1)
                metrics['eval/precision@5'] = tf.metrics.precision_at_k(
                    labels_set, logits, k=5)
                metrics['eval/recall@5'] = tf.metrics.recall_at_k(labels_set,
                                                                  logits,
                                                                  k=5)

            # always add accuracy.
            labels = tf.argmax(labels, axis=1)
            predictions = tf.argmax(logits, axis=1)
            metrics['eval/acc_top1'] = tf.metrics.accuracy(labels,
                                                           predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            metrics['eval/acc_top5'] = tf.metrics.mean(in_top_5)
            metrics['model/resolution'] = tf.metrics.mean(image_size)
            metrics['model/flops'] = tf.metrics.mean(num_flops)
            metrics['model/params'] = tf.metrics.mean(num_params)
            return metrics

        eval_metrics = (metric_fn, [labels, logits])

    if has_moving_average_decay and not is_training:

        def scaffold_fn():  # read ema for eval jobs.
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)

    elif config.train.ft_init_ckpt and is_training:

        def scaffold_fn():
            """Restore the fine-tuning init checkpoint via a scaffold."""
            logging.info('restore variables from %s',
                         config.train.ft_init_ckpt)
            var_map = utils.get_ckpt_var_map(
                ckpt_path=config.train.ft_init_ckpt,
                skip_mismatch=True,
                init_ema=config.train.ft_init_ema)
            tf.train.init_from_checkpoint(config.train.ft_init_ckpt, var_map)
            return tf.train.Scaffold()

    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=loss,
                                             train_op=train_op,
                                             host_call=host_call,
                                             eval_metrics=eval_metrics,
                                             scaffold_fn=scaffold_fn)
def clip_if_not_none(grad, clip_norm=5.):
    """Return *grad* clipped to L2 norm `clip_norm`; pass None through unchanged."""
    return grad if grad is None else tf.clip_by_norm(grad, clip_norm)
def gradient_clipping(optimizer, computed_loss, learning_rate, beta1):
    """Compute gradients of `computed_loss`, clip each to L2 norm 5, and apply.

    Args:
      optimizer: an object exposing `compute_gradients(loss)` returning
        (grad, var) pairs and `apply_gradients(pairs)`.
      computed_loss: the loss tensor to differentiate.
      learning_rate: unused here; kept for interface compatibility with
        existing callers (the optimizer is assumed to be pre-configured).
      beta1: unused here; kept for interface compatibility.

    Returns:
      The op returned by `optimizer.apply_gradients`.
    """
    grads_and_vars = optimizer.compute_gradients(computed_loss)
    # BUGFIX: the comprehension previously unpacked `for grads, var in grads`,
    # shadowing the list and leaving `grad` undefined (NameError at runtime).
    # Also skip clipping for variables with no gradient (grad is None), which
    # tf.clip_by_norm would otherwise reject.
    clipped_grads = [(tf.clip_by_norm(grad, 5) if grad is not None else None,
                      var) for grad, var in grads_and_vars]
    return optimizer.apply_gradients(clipped_grads)