def grad_variance(self):
  """Estimate the variance of the minibatch gradient."""
  grad_var_ops = []
  tensor_to_avg = []
  for t, g in zip(self._tvars, self._grads):
    if isinstance(g, ops.IndexedSlices):
      # Densify sparse gradients so they can be moving-averaged.
      tensor_to_avg.append(
          tf.reshape(
              tf.unsorted_segment_sum(g.values, g.indices, g.dense_shape[0]),
              shape=t.get_shape()))
    else:
      tensor_to_avg.append(g)
  avg_op = self._moving_averager.apply(tensor_to_avg)
  grad_var_ops.append(avg_op)
  with tf.control_dependencies([avg_op]):
    self._grad_avg = [
        self._moving_averager.average(val) for val in tensor_to_avg
    ]
    self._grad_avg_squared = [tf.square(val) for val in self._grad_avg]
  # Var[g] ~= E[||g||^2] - ||E[g]||^2, floored at EPS for numerical safety.
  self._grad_var = tf.maximum(
      tf.constant(EPS, dtype=self._grad_norm_squared_avg.dtype),
      self._grad_norm_squared_avg -
      tf.add_n([tf.reduce_sum(val) for val in self._grad_avg_squared]))
  if self._sparsity_debias:
    self._grad_var *= self._sparsity_avg
  return grad_var_ops
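
# Illustrative sketch, not part of the optimizer: `grad_variance` relies on the
# identity Var[g] ~= E[||g||^2] - ||E[g]||^2, with both expectations taken as
# exponential moving averages. The NumPy check below restates that identity for
# a batch of sampled gradients; every name in it is hypothetical.
def _grad_variance_identity_sketch():
  import numpy as np
  grads = np.random.randn(100, 5)  # 100 minibatch gradients of dimension 5
  grad_norm_squared_avg = np.mean(np.sum(grads**2, axis=1))  # E[||g||^2]
  grad_avg_squared_sum = np.sum(np.mean(grads, axis=0)**2)   # ||E[g]||^2
  return max(1e-6, grad_norm_squared_avg - grad_avg_squared_sum)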
def grad_sparsity(self):
  # If the sparse minibatch gradient has 10 percent of its entries
  # non-zero, its sparsity is 0.1.
  # The norm of the dense gradient averaged over the full dataset is
  # roughly estimated as the minibatch sparse gradient norm * sqrt(sparsity).
  # A possible extension is to correct only the sparse blobs.
  non_zero_cnt = tf.add_n([tf.count_nonzero(g) for g in self._grads])
  all_entry_cnt = tf.add_n([tf.size(g) for g in self._grads])
  self._sparsity = tf.cast(non_zero_cnt, self._grads[0].dtype) \
      / tf.cast(all_entry_cnt, self._grads[0].dtype)
  avg_op = self._moving_averager.apply([self._sparsity])
  with tf.control_dependencies([avg_op]):
    self._sparsity_avg = self._moving_averager.average(self._sparsity)
  return avg_op
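
# Illustrative sketch with hypothetical numbers, not part of the optimizer: if
# a fraction `sparsity` of the gradient entries is non-zero, the dense-gradient
# norm is roughly the sparse minibatch gradient norm scaled by sqrt(sparsity),
# so squared-norm quantities such as the variance estimate scale by `sparsity`
# itself, which is what `self._grad_var *= self._sparsity_avg` applies.
def _sparsity_debias_sketch():
  sparsity = 0.1                # 10% of the gradient entries are non-zero
  sparse_grad_norm = 4.0        # norm of the sparse minibatch gradient
  dense_grad_norm_estimate = sparse_grad_norm * sparsity**0.5
  dense_grad_norm_squared_estimate = sparse_grad_norm**2 * sparsity
  return dense_grad_norm_estimate, dense_grad_norm_squared_estimate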
def before_apply(self):
  self._moving_averager = tf.train.ExponentialMovingAverage(
      decay=self._beta, zero_debias=self._zero_debias)
  assert self._grads is not None and len(self._grads) > 0
  before_apply_ops = []

  # get per var g**2 and norm**2
  self._grad_squared = []
  self._grad_norm_squared = []
  for v, g in zip(self._tvars, self._grads):
    if g is None:
      continue
    with ops.colocate_with(v):
      self._grad_squared.append(tf.square(g))
  self._grad_norm_squared = [
      tf.reduce_sum(grad_squared) for grad_squared in self._grad_squared
  ]

  if self._sparsity_debias:
    avg_op_sparsity = self.grad_sparsity()
    before_apply_ops.append(avg_op_sparsity)

  # the following running average on squared norm of gradient is shared
  # by `grad_variance` and `dist_to_opt`
  avg_op = self._moving_averager.apply(self._grad_norm_squared)
  with tf.control_dependencies([avg_op]):
    self._grad_norm_squared_avg = [
        self._moving_averager.average(val)
        for val in self._grad_norm_squared
    ]
    self._grad_norm_squared = tf.add_n(self._grad_norm_squared)
    self._grad_norm_squared_avg = tf.add_n(self._grad_norm_squared_avg)
  before_apply_ops.append(avg_op)

  with tf.control_dependencies([avg_op]):
    curv_range_ops = self.curvature_range()
    before_apply_ops += curv_range_ops
    grad_var_ops = self.grad_variance()
    before_apply_ops += grad_var_ops
    dist_to_opt_ops = self.dist_to_opt()
    before_apply_ops += dist_to_opt_ops
  return tf.group(*before_apply_ops)
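
# Illustrative sketch in plain Python with hypothetical names: the statistics
# above are tracked with tf.train.ExponentialMovingAverage(decay=beta). With
# zero_debias enabled, the shadow value is corrected for its zero
# initialization, analogous to Adam-style bias correction.
def _ema_sketch(values, beta=0.999):
  avg, debiased = 0.0, []
  for t, x in enumerate(values, start=1):
    avg = beta * avg + (1.0 - beta) * x      # shadow-variable update
    debiased.append(avg / (1.0 - beta**t))   # zero-debias correction
  return debiased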
def l2_loss(self, tvars=None):
  _l2_loss = 0.0
  weight_decay = self.config['solver']['optimizer'].get('weight_decay', None)
  if weight_decay:
    logging.info(f"add L2 Loss with decay: {weight_decay}")
    with tf.name_scope('l2_loss'):
      tvars = tvars if tvars else tf.trainable_variables()
      tvars = [v for v in tvars if 'bias' not in v.name]
      _l2_loss = weight_decay * tf.add_n(
          [tf.nn.l2_loss(v) for v in tvars])
      summary_lib.scalar('l2_loss', _l2_loss)
  return _l2_loss
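
# Illustrative sketch with hypothetical names: tf.nn.l2_loss(v) computes
# sum(v**2) / 2, so the regularizer above equals
# weight_decay * sum over non-bias trainable variables of sum(v**2) / 2.
def _l2_loss_sketch(named_weights, weight_decay=1e-4):
  import numpy as np
  non_bias = {n: w for n, w in named_weights.items() if 'bias' not in n}
  return weight_decay * sum(0.5 * np.sum(w**2) for w in non_bias.values())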
def build(self, mode: str):
  """Build the model for training, eval and infer."""
  inputs = self.input_fn(mode)
  logging.info("build input data done...")
  model = self.model_fn()
  training = mode == utils.TRAIN
  model.logits = model(inputs["input_x_dict"], training=training)
  model.iterator = inputs["iterator"]
  model.input_x_dict = inputs["input_x_dict"]
  model.input_x_len = inputs["input_x_len"]
  model.loss_fn = self.get_loss_fn()
  if mode != utils.INFER or not self.infer_no_label:
    input_y = inputs["input_y_dict"]["input_y"]
    if isinstance(model.loss_fn, list):
      # One loss per output head; the final loss op is their sum.
      model.loss = []
      for i, one_loss_fn in enumerate(model.loss_fn):
        one_loss = one_loss_fn(
            labels=input_y[i],
            logits=model.logits[i],
            input_length=model.input_x_len,
            model=model,
            name="loss_{}".format(i))
        model.loss.append(one_loss)
      model.loss_op = tf.add_n(model.loss, name="loss_sum")
    else:
      model.loss = model.loss_fn(
          labels=input_y,
          logits=model.logits,
          input_length=model.input_x_len,
          model=model,
          name="loss")
      model.loss_op = model.loss
    logging.info("model.loss done")
    model.input_y = input_y
  # output related
  self.build_output(model)
  return model
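
# Illustrative sketch, not a loss shipped with this code: any callable returned
# by `get_loss_fn` only needs to accept the keyword arguments that `build`
# passes, i.e. labels, logits, input_length, model and name. The cross-entropy
# body below is an assumed example of such an interface.
def _example_loss_fn(labels, logits, input_length=None, model=None, name="loss"):
  del input_length, model  # unused by this toy example
  return tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels, logits=logits),
      name=name)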