def _get_optimizer(self, name):
    """Build a tensorpack Adam optimizer for one sub-network.

    Args:
        name: 'actor' or 'critic'; selects the initial learning rate and
            which variable scope gets gradient clipping.

    Returns:
        A tensorpack optimizer that clips gradients by average norm
        (0.1 for actor, 0.05 for critic) and records gradient summaries.

    Raises:
        ValueError: if `name` is neither 'actor' nor 'critic'.
    """
    from tensorpack.tfutils import optimizer
    from tensorpack.tfutils.gradproc import SummaryGradient, GlobalNormClip, MapGradient
    import tensorpack.tfutils.symbolic_functions as symbf

    init_lr = INIT_LEARNING_RATE_A if name == 'actor' else INIT_LEARNING_RATE_C
    lr = symbf.get_scalar_var('learning_rate/' + name, init_lr, summary=True)
    opt = tf.train.AdamOptimizer(lr)
    logger.info("create opt {}".format(name))
    if name == 'critic':
        gradprocs = [
            MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.05),
                        regex='^critic/.*')
        ]
    elif name == 'actor':
        gradprocs = [
            MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1),
                        regex='^actor/.*')
        ]
    else:
        # `assert (0)` would be stripped under `python -O`; fail loudly instead.
        raise ValueError("unknown optimizer name: {!r}".format(name))
    gradprocs.append(SummaryGradient())
    opt = optimizer.apply_grad_processors(opt, gradprocs)
    return opt
def clip_vars(self, params):
    """Intended to clip all model parameters by average L2 norm.

    NOTE(review): the two `for` loops below only rebind the loop-local
    names `W` and `b` — they do NOT modify `self.weights` /
    `self.biases`, so those loops are no-ops. Only `self.W_fc` and
    `self.b_fc` are actually replaced, and they are replaced by clipped
    *tensors*, not by updated variables (a `tf.assign` would be needed
    to mutate the variables in place). Verify intent before relying on
    this for regularization.
    """
    for W in self.weights:
        # no-op: rebinds the local name only
        W = tf.clip_by_average_norm(W, params['REG_STRENGTH'])
    for b in self.biases:
        # no-op: rebinds the local name only
        b = tf.clip_by_average_norm(b, params['REG_STRENGTH'])
    self.W_fc = tf.clip_by_average_norm(self.W_fc, params['REG_STRENGTH'])
    self.b_fc = tf.clip_by_average_norm(self.b_fc, params['REG_STRENGTH'])
def _get_opt(name, init_lr):
    """Create a named Adam optimizer whose gradients are clipped per scope.

    Actor gradients are clipped by average norm 0.1, critic gradients by
    0.05; all gradients are also summarized.
    """
    lr = symbf.get_scalar_var('learning_rate/' + name, init_lr, summary=True)
    logger.info("create opt {}".format(name))
    # MapGradient(lambda grad: tf.Print(grad, [grad], 'grad {}='.format(grad.op.name), summarize=4)),
    clip_actor = MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.1),
                             regex='^actor/.*')
    clip_critic = MapGradient(lambda grad: tf.clip_by_average_norm(grad, 0.05),
                              regex='^critic/.*')
    # GlobalNormClip(40.),
    processors = [clip_actor, clip_critic, SummaryGradient()]
    return optimizer.apply_grad_processors(tf.train.AdamOptimizer(lr), processors)
def _optimize(self, loss):
    """Build two training ops: one over all trainable variables, one
    excluding the embedding variables.

    Gradients are clipped per-tensor by average norm (self.grad_clip);
    variables whose gradient is None are dropped before applying.

    Bug fix: the original second pass computed gradients w.r.t.
    `self.train_vars_no_embedding` but zipped them against
    `self.train_vars`, pairing each clipped gradient with the wrong
    variable.

    Returns:
        (train_op, train_op_no_embedding)
    """
    def _clipped_pairs(var_list):
        # Compute grads, drop None entries, clip each by average norm.
        grads = tf.gradients(loss, var_list)
        kept = [(g, v) for g, v in zip(grads, var_list) if g is not None]
        return [(tf.clip_by_average_norm(g, self.grad_clip), v) for g, v in kept]

    train_op = self.optimizer.apply_gradients(_clipped_pairs(self.train_vars))
    train_op_no_embedding = self.optimizer.apply_gradients(
        _clipped_pairs(self.train_vars_no_embedding))
    return train_op, train_op_no_embedding
def regularize(output, weights, W_fc, biases, b_fc, params, sess):
    """Intended to clip all parameters by average L2 norm under `sess`,
    logging when the sum of the first weight tensor decreased.

    NOTE(review): the two `for` loops below rebind loop-local names only
    — the `weights`/`biases` lists are NOT modified, so only `W_fc` and
    `b_fc` are actually replaced (by clipped tensors). Consequently the
    "weights clipped" comparison below effectively compares the same
    value twice — verify intent.
    """
    with sess.as_default():
        # if j == 0:
        # l2_loss = tf.div(tf.sqrt(tf.nn.l2_loss(weights[0])), tf.convert_to_tensor(2.0)).eval()
        # output.write('l2 loss is %g\n' %l2_loss)
        check_l2 = tf.reduce_sum(weights[0]).eval()
        for W in weights:
            # no-op: rebinds the local name only
            W = tf.clip_by_average_norm(W, params['L2_NORM_CONSTRAINT'])
        for b in biases:
            # no-op: rebinds the local name only
            b = tf.clip_by_average_norm(b, params['L2_NORM_CONSTRAINT'])
        W_fc = tf.clip_by_average_norm(W_fc, params['L2_NORM_CONSTRAINT'])
        b_fc = tf.clip_by_average_norm(b_fc, params['L2_NORM_CONSTRAINT'])
        if np.asscalar(check_l2) > np.asscalar(tf.reduce_sum(weights[0]).eval()):
            output.write('weights clipped\n')
    return weights, W_fc, biases, b_fc
def setup_gradients(self, prefix, opt, cost):
    """Collect scoped gradients of `cost` and build clipped apply targets.

    For each trainable variable inside this object's scope, records the
    raw gradient, a display name, and a (clipped placeholder, variable)
    pair: the placeholder is a same-shaped feed slot whose value is
    clipped by average norm 1 before being applied.

    Args:
        prefix: string prefixed to the gradient/placeholder names.
        opt: an optimizer used only for compute_gradients().
        cost: scalar loss tensor.

    Returns:
        (ret_grads, ret_names, ret_apply): raw gradient tensors, their
        names, and (clipped_placeholder, variable) pairs suitable for
        apply_gradients(). Also appends histogram summaries to
        self.summary_apply_gradients as a side effect.
    """
    grads = opt.compute_gradients(cost)
    ret_grads = []
    ret_names = []
    ret_apply = []
    for grad, var in grads:
        if grad is None or var is None:
            continue
        #print "var: %s, gradient: %s" % (var, grad)
        # Only handle variables belonging to this network's scope.
        if self.scope != get_scope_name(var.name):
            continue
        pname = get_param_name(var.name)
        gname = '%s/gradient_%s' % (prefix, pname)
        # print(...) form works under both Python 2 and 3; the original
        # used a Python-2-only print statement.
        print("gradient %s -> %s" % (var, gname))
        # get all gradients
        ret_grads.append(grad)
        ret_names.append(gname)
        pl = tf.placeholder(tf.float32, shape=var.get_shape(), name=gname)
        clip = tf.clip_by_average_norm(pl, 1)
        ret_apply.append((clip, var))
        ag = tf.summary.histogram(
            '%s/%s/apply_%s' % (self.scope, prefix, gname), clip)
        self.summary_apply_gradients.append(ag)
    return ret_grads, ret_names, ret_apply
def __init__(
        self,
        input_size=INPUT_SIZE,
        hidden_size=800,
        rating_scale=10,
        # 800, tanh = 0.042: GradientDescentOptimizer
        # NOTE(review): this default optimizer is constructed once at
        # class-definition time and shared across instances — verify.
        optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.0001),
        grad_norm=1,
        activation=tf.nn.sigmoid):
    """Single-hidden-layer rating predictor.

    Output is a sigmoid scaled into [0, rating_scale]; loss is the
    absolute difference against the fed `actual_rating`.
    """
    self.inputs = tf.placeholder(tf.float32, [None, input_size])
    self.hidden1 = tf.layers.dense(self.inputs, hidden_size, activation=activation)
    #self.hidden2 = tf.layers.dense(self.hidden1, hidden_size, activation=activation)
    # self.hidden3 = tf.layers.dense(self.hidden2, hidden_size, activation=activation)
    # self.hidden4 = tf.layers.dense(self.hidden3, hidden_size, activation=activation)
    self.prediction_raw = tf.layers.dense(self.hidden1, 1)
    self.prediction = rating_scale * tf.nn.sigmoid(self.prediction_raw)
    self.prediction = tf.squeeze(self.prediction, axis=1)
    self.actual_rating = tf.placeholder(tf.float32, [
        None,
    ])
    self.loss = tf.losses.absolute_difference(labels=self.actual_rating, predictions=self.prediction)
    model_variables = tf.trainable_variables()
    gradients = optimizer.compute_gradients(self.loss, model_variables)
    # NOTE(review): clipped_gradients is computed but never applied —
    # self.train uses optimizer.minimize below, so `grad_norm` currently
    # has no effect on training.
    clipped_gradients = [(tf.clip_by_average_norm(gradient, grad_norm), variable)
                         for gradient, variable in gradients]
    #self.train = optimizer.apply_gradients(clipped_gradients)#
    self.train = optimizer.minimize(self.loss)
def optimizer(self):
    """Adam on a fixed non-trainable learning rate (1e-4), with each
    gradient clipped by average norm 0.3."""
    learning_rate = tf.get_variable('learning_rate', initializer=1e-4,
                                    trainable=False)
    adam = tf.train.AdamOptimizer(learning_rate)
    processors = [MapGradient(lambda g: tf.clip_by_average_norm(g, 0.3))]
    # SummaryGradient()]
    return optimizer.apply_grad_processors(adam, processors)
def _create_train_op(self):
    """Build supervised and RL training ops sharing one RMSProp optimizer.

    Supervised path: cross-entropy between policy logits and the fed
    action index, grouped with self.extra_train_ops into self.train_op.
    RL path: REINFORCE-style cost weighted by self.y_r, with per-tensor
    average-norm gradient clipping, grouped into self.train_rl_op.
    """
    self.opt = tf.train.RMSPropOptimizer(
        learning_rate=self.var_learning_rate,
        decay=Config.RMSPROP_DECAY,
        momentum=Config.RMSPROP_MOMENTUM,
        epsilon=Config.RMSPROP_EPSILON)
    self.log_likelihood_cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=self.logits_p,
                                                labels=self.action_index))
    train_op = self.opt.minimize(self.log_likelihood_cost,
                                 global_step=self.global_step)
    train_ops = [train_op] + self.extra_train_ops
    self.train_op = tf.group(*train_ops)
    # for the case of reinforcement learning
    self.selected_action_prob = tf.reduce_sum(self.softmax_p * self.action_index, axis=1)
    # log_epsilon floor keeps tf.log away from -inf.
    self.rl_cost = - tf.log(tf.maximum(self.selected_action_prob, self.log_epsilon)) * self.y_r
    self.rl_cost = tf.reduce_sum(self.rl_cost, axis=0)
    self.opt_grad = self.opt.compute_gradients(self.rl_cost)
    # NOTE(review): None gradients are not filtered before clipping —
    # clip_by_average_norm on a None entry would fail; confirm all
    # variables receive gradients here.
    self.opt_grad_clipped = [(tf.clip_by_average_norm(g, Config.GRAD_CLIP_NORM), v)
                             for g, v in self.opt_grad]
    train_rl_op = self.opt.apply_gradients(self.opt_grad_clipped)
    #train_rl_op = self.opt.minimize(self.rl_cost, global_step=self.global_step)
    train_rl_ops = [train_rl_op] + self.extra_train_ops
    self.train_rl_op = tf.group(*train_rl_ops)
def average_gradients(self, tower_grads):
    """Average per-tower gradients and optionally clip them.

    Args:
        tower_grads: list (one per tower) of lists of (grad, var) pairs
            as returned by compute_gradients(); towers must list the
            same variables in the same order.

    Returns:
        A list of (grad, var) pairs averaged across towers; when
        self.cfg.clip_gradient is set each gradient is clipped by
        average norm with self.cfg.clip_gradient_value.

    Fixes: removed a dead `g, _ = grad_and_vars[0]` statement, and
    wrapped the clipped result in list() so both branches return a
    reusable list under Python 3 (the original returned a one-shot
    zip iterator when clipping).
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Each grad_and_vars looks like:
        # ((grad0_gpu0, var0_gpu0), ..., (grad0_gpuN, var0_gpuN))
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        # Average over the 'tower' dimension.
        grad = tf.reduce_mean(tf.concat(grads, axis=0), 0)
        # Variables are shared across towers, so the first tower's
        # pointer stands in for all of them.
        v = grad_and_vars[0][1]
        average_grads.append((grad, v))
    # clip
    if self.cfg.clip_gradient:
        gradients, variables = zip(*average_grads)
        gradients = [
            None if gradient is None else tf.clip_by_average_norm(
                gradient, self.cfg.clip_gradient_value)
            for gradient in gradients
        ]
        average_grads = list(zip(gradients, variables))
    return average_grads
def regularize(output, weights, W_fc, biases, b_fc, params, sess):
    """Intended to clip all parameters by average L2 norm under `sess`,
    logging when the sum of the first weight tensor decreased.

    NOTE(review): duplicate of the earlier `regularize` in this file,
    with the same defect — the two `for` loops rebind loop-local names
    only, so the `weights`/`biases` lists are never modified; only
    `W_fc`/`b_fc` are replaced (by clipped tensors). Verify intent.
    """
    with sess.as_default():
        # if j == 0:
        # l2_loss = tf.div(tf.sqrt(tf.nn.l2_loss(weights[0])), tf.convert_to_tensor(2.0)).eval()
        # output.write('l2 loss is %g\n' %l2_loss)
        check_l2 = tf.reduce_sum(weights[0]).eval()
        for W in weights:
            # no-op: rebinds the local name only
            W = tf.clip_by_average_norm(W, params['L2_NORM_CONSTRAINT'])
        for b in biases:
            # no-op: rebinds the local name only
            b = tf.clip_by_average_norm(b, params['L2_NORM_CONSTRAINT'])
        W_fc = tf.clip_by_average_norm(W_fc, params['L2_NORM_CONSTRAINT'])
        b_fc = tf.clip_by_average_norm(b_fc, params['L2_NORM_CONSTRAINT'])
        if np.asscalar(check_l2) > np.asscalar(
                tf.reduce_sum(weights[0]).eval()):
            output.write('weights clipped\n')
    return weights, W_fc, biases, b_fc
def optimizer(self):
    """Adam (cfg learning rate) with average-norm gradient clipping at
    0.3 and gradient summaries."""
    base = tf.train.AdamOptimizer(self.cfg.learning_rate)
    clip = gradproc.MapGradient(lambda g: tf.clip_by_average_norm(g, 0.3))
    summarize = gradproc.SummaryGradient()
    return optimizer.apply_grad_processors(base, [clip, summarize])
def clip_func(grads):
    """Clip a list of (grad, var) pairs according to the enclosing
    scope's `clip_type` / `clip_bounds`.

    None gradients and non-trainable variables are dropped (both
    adding/not adding None entries are valid choices; we drop them).

    Bug fix: the 'global_norm' branch previously passed a single tensor
    to tf.clip_by_global_norm, which expects a *list* and returns a
    (clipped_list, norm) tuple — corrupting the pair list. Global-norm
    clipping is now applied once across all surviving gradients.

    Raises:
        ValueError: on an unrecognized clip_type.
    """
    kept = [(g, v) for g, v in grads if g is not None and v.trainable]
    if clip_type in ('none', 'None'):
        return kept
    if clip_type == 'value':
        return [(tf.clip_by_value(g, clip_bounds[0], clip_bounds[1]), v)
                for g, v in kept]
    if clip_type == 'norm':
        return [(tf.clip_by_norm(g, clip_bounds), v) for g, v in kept]
    if clip_type == 'global_norm':
        if not kept:
            return []
        gs, vs = zip(*kept)
        clipped, _ = tf.clip_by_global_norm(list(gs), clip_bounds)
        return list(zip(clipped, vs))
    if clip_type == 'average_norm':
        return [(tf.clip_by_average_norm(g, clip_bounds), v) for g, v in kept]
    raise ValueError(
        "Unrecognized gradient clipping method: {}.".format(clip_type))
def __init__(self, nA, learning_rate,decay,grad_clip,entropy_beta,
             state_shape=[84,84,4],  # NOTE(review): mutable default; safe only if never mutated
             master=None, device_name='/gpu:0', scope_name='master'):
    """One A3C actor-critic tower.

    Builds the shared conv trunk plus policy and value heads. When
    `master` is given this is a worker: it also builds the combined
    loss, clips each gradient by average norm (`grad_clip`), and
    creates a train op that applies those gradients to the *master's*
    variables. Otherwise this is the master and only creates the
    shared RMSProp optimizer.
    """
    with tf.device(device_name) :
        self.state = tf.placeholder(tf.float32,[None]+state_shape)
        block, self.scope = ActorCritic._build_shared_block(self.state,scope_name)
        self.policy, self.log_softmax_policy = ActorCritic._build_policy(block,nA,scope_name)
        self.value = ActorCritic._build_value(block,scope_name)
        # Sorted by name so worker and master variable lists line up 1:1.
        self.train_vars = sorted(tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope.name), key=lambda v:v.name)
        if( master is not None ) :
            self.sync_op= self._sync_op(master)
            self.action = tf.placeholder(tf.int32,[None,])
            self.target_value = tf.placeholder(tf.float32,[None,])
            advantage = self.target_value - self.value
            entropy = tf.reduce_sum(-1. * self.policy * self.log_softmax_policy,axis=1)
            log_p_s_a = tf.reduce_sum(self.log_softmax_policy * tf.one_hot(self.action,nA),axis=1)
            # stop_gradient: the advantage acts as a constant weight for
            # the policy gradient (no gradient flows into the value head
            # through this term).
            self.policy_loss = tf.reduce_mean(tf.stop_gradient(advantage)*log_p_s_a)
            self.entropy_loss = tf.reduce_mean(entropy)
            self.value_loss = tf.reduce_mean(advantage**2)
            loss = -self.policy_loss - entropy_beta* self.entropy_loss + self.value_loss
            self.gradients = tf.gradients(loss,self.train_vars)
            clipped_gs = [tf.clip_by_average_norm(g,grad_clip) for g in self.gradients]
            # Worker gradients update the master's variables.
            self.train_op = master.optimizer.apply_gradients(zip(clipped_gs,master.train_vars))
        else :
            #self.optimizer = tf.train.AdamOptimizer(learning_rate,beta1=BETA)
            self.optimizer = tf.train.RMSPropOptimizer(learning_rate,decay=decay,use_locking=True)
def _create_graph(self):
    """Build a supervised policy network: conv trunk, dense head, MSE
    cost against fed targets, and an RMSProp train op (optionally with
    average-norm gradient clipping)."""
    self.x = tf.placeholder(
        tf.float32, [None, self.img_height, self.img_width, self.img_channels],
        name='X')
    self.y = tf.placeholder(tf.float32, [None, self.num_actions], name='Y')
    self.var_beta = tf.placeholder(tf.float32, name='beta', shape=[])
    self.var_learning_rate = tf.placeholder(tf.float32, name='lr', shape=[])
    self.global_step = tf.Variable(0, trainable=False, name='step')
    # As implemented in A3C paper
    self.n1 = self.conv2d_layer(self.x, 8, 32, 'conv11', strides=[1, 4, 4, 1])
    self.n2 = self.conv2d_layer(self.n1, 4, 64, 'conv12', strides=[1, 2, 2, 1])
    self.n3 = self.conv2d_layer(self.n2, 3, 64, 'conv13', strides=[1, 1, 1, 1])
    self.action_index = tf.placeholder(tf.float32, [None, self.num_actions])
    _input = self.n3
    flatten_input_shape = _input.get_shape()
    nb_elements = flatten_input_shape[1] * flatten_input_shape[2] * flatten_input_shape[3]
    # ._value: reaches into TF1 Dimension internals — TODO confirm TF version.
    self.flat = tf.reshape(_input, shape=[-1, nb_elements._value])
    self.d1 = self.dense_layer(self.flat, 512, 'dense1', func=tf.nn.relu)
    self.d2 = self.dense_layer(self.d1, self.num_actions, 'logits_p', func=None)
    #self.logits_v = tf.squeeze(self.dense_layer(self.d1, 1, 'logits_v', func=None), axis=[1])
    #self.cost_v = 0.5 * tf.reduce_sum(tf.square(self.y_r - self.logits_v), axis=0)
    #self.logits_p = self.dense_layer(self.d1, self.num_actions, 'logits_p', func=None)
    # Disabled A3C policy-cost branch kept below for reference.
    '''
    if Config.USE_LOG_SOFTMAX:
        self.softmax_p = tf.nn.softmax(self.logits_p)
        self.log_softmax_p = tf.nn.log_softmax(self.logits_p)
        self.log_selected_action_prob = tf.reduce_sum(self.log_softmax_p * self.action_index, axis=1)
        self.cost_p_1 = self.log_selected_action_prob * (self.y_r - tf.stop_gradient(self.logits_v))
        self.cost_p_2 = -1 * self.var_beta * \
            tf.reduce_sum(self.log_softmax_p * self.softmax_p, axis=1)
    else:
        self.softmax_p = (tf.nn.softmax(self.logits_p) + Config.MIN_POLICY) / (1.0 + Config.MIN_POLICY * self.num_actions)
        self.selected_action_prob = tf.reduce_sum(self.softmax_p * self.action_index, axis=1)
        self.cost_p_1 = tf.log(tf.maximum(self.selected_action_prob, self.log_epsilon)) \
            * (self.y_r - tf.stop_gradient(self.logits_v))
        self.cost_p_2 = -1 * self.var_beta * \
            tf.reduce_sum(tf.log(tf.maximum(self.softmax_p, self.log_epsilon)) * self.softmax_p, axis=1)
    '''
    self.cost_all = tf.losses.mean_squared_error(self.y, self.d2)
    self.opt = tf.train.RMSPropOptimizer(
        learning_rate=self.var_learning_rate,
        decay=Config.RMSPROP_DECAY,
        momentum=Config.RMSPROP_MOMENTUM,
        epsilon=Config.RMSPROP_EPSILON)
    if Config.USE_GRAD_CLIP:
        # NOTE(review): None gradients are not filtered before clipping,
        # and global_step only advances on the non-clip branch below.
        self.opt_grad = self.opt.compute_gradients(self.cost_all)
        self.opt_grad_clipped = [(tf.clip_by_average_norm(g, Config.GRAD_CLIP_NORM), v)
                                 for g, v in self.opt_grad]
        self.train_op = self.opt.apply_gradients(self.opt_grad_clipped)
    else:
        self.train_op = self.opt.minimize(self.cost_all, global_step=self.global_step)
def _get_optimizer(self):
    """Adam (lr 0.001, epsilon 1e-3) with every gradient clipped by
    average norm 0.1 and summarized."""
    learning_rate = symbf.get_scalar_var('learning_rate', 0.001, summary=True)
    adam = tf.train.AdamOptimizer(learning_rate, epsilon=1e-3)
    processors = [
        MapGradient(lambda g: tf.clip_by_average_norm(g, 0.1)),
        SummaryGradient(),
    ]
    return optimizer.apply_grad_processors(adam, processors)
def _get_opt(name, init_lr):
    """Named Adam optimizer: actor gradients clipped by average norm 0.1,
    critic gradients by 0.05, plus gradient summaries."""
    lr = symbf.get_scalar_var('learning_rate/' + name, init_lr, summary=True)
    opt = tf.train.AdamOptimizer(lr)
    logger.info("create opt {}".format(name))
    # MapGradient(lambda grad: tf.Print(grad, [grad], 'grad {}='.format(grad.op.name), summarize=4)),
    processors = []
    # bound is bound as a default argument to avoid late-binding closures.
    for pattern, bound in (('^actor/.*', 0.1), ('^critic/.*', 0.05)):
        processors.append(
            MapGradient(lambda grad, b=bound: tf.clip_by_average_norm(grad, b),
                        regex=pattern))
    # GlobalNormClip(40.),
    processors.append(SummaryGradient())
    return optimizer.apply_grad_processors(opt, processors)
def optimizer(self):
    """Adam on a non-trainable lr variable (0.001, epsilon 1e-3);
    gradients clipped by average norm 0.1 and summarized."""
    lr_var = tf.get_variable('learning_rate', initializer=0.001,
                             trainable=False)
    clip = MapGradient(lambda g: tf.clip_by_average_norm(g, 0.1))
    return optimizer.apply_grad_processors(
        tf.train.AdamOptimizer(lr_var, epsilon=1e-3),
        [clip, SummaryGradient()])
def optimizer(self):
    """Adam on a non-trainable lr variable initialized from
    self.learning_rate; gradients clipped by average norm 0.5."""
    lr_var = tf.get_variable('learning_rate', initializer=self.learning_rate,
                             trainable=False)
    # opt = tf.train.AdamOptimizer(lr_var, epsilon=1e-3)
    base_opt = tf.train.AdamOptimizer(lr_var)
    grad_procs = [
        # gradproc.GlobalNormClip(2.0),
        gradproc.MapGradient(lambda g: tf.clip_by_average_norm(g, 0.5)),
        gradproc.SummaryGradient(),
    ]
    return optimizer.apply_grad_processors(base_opt, grad_procs)
def _clip_grads(self, grads):
    """Clip a list of gradient tensors per self.clip_norm_type.

    Args:
        grads: list of gradient tensors.

    Returns:
        The (possibly clipped) list of gradients.

    Raises:
        ValueError: on an unknown clip_norm_type (the original silently
        fell through and returned None).

    Bug fix: the 'avg' branch previously called
    tf.clip_by_average_norm on the whole *list* and indexed [0], which
    stacks the gradients into one tensor and returns only its first
    row; each gradient is now clipped independently.
    """
    if self.clip_norm_type == 'ignore':
        return grads
    if self.clip_norm_type == 'global':
        # clip_by_global_norm returns (clipped_list, global_norm).
        return tf.clip_by_global_norm(grads, self.clip_norm)[0]
    if self.clip_norm_type == 'avg':
        return [tf.clip_by_average_norm(g, self.clip_norm) for g in grads]
    if self.clip_norm_type == 'local':
        return [tf.clip_by_norm(g, self.clip_norm) for g in grads]
    raise ValueError("unknown clip_norm_type: {!r}".format(self.clip_norm_type))
def __init__(self,
             nA,
             learning_rate,
             decay,
             grad_clip,
             entropy_beta,
             state_shape=[84, 84, 4],  # NOTE(review): mutable default; safe only if never mutated
             master=None,
             device_name='/gpu:0',
             scope_name='master'):
    """One A3C actor-critic tower (formatted duplicate of the earlier
    ActorCritic.__init__ in this file).

    Workers (master is not None) build the loss, clip each gradient by
    average norm, and apply gradients to the master's variables; the
    master only creates the shared RMSProp optimizer.
    """
    with tf.device(device_name):
        self.state = tf.placeholder(tf.float32, [None] + state_shape)
        block, self.scope = ActorCritic._build_shared_block(
            self.state, scope_name)
        self.policy, self.log_softmax_policy = ActorCritic._build_policy(
            block, nA, scope_name)
        self.value = ActorCritic._build_value(block, scope_name)
        # Sorted by name so worker and master variable lists line up 1:1.
        self.train_vars = sorted(tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, self.scope.name),
            key=lambda v: v.name)
        if (master is not None):
            self.sync_op = self._sync_op(master)
            self.action = tf.placeholder(tf.int32, [
                None,
            ])
            self.target_value = tf.placeholder(tf.float32, [
                None,
            ])
            advantage = self.target_value - self.value
            entropy = tf.reduce_sum(-1. * self.policy *
                                    self.log_softmax_policy,
                                    axis=1)
            log_p_s_a = tf.reduce_sum(self.log_softmax_policy *
                                      tf.one_hot(self.action, nA),
                                      axis=1)
            # stop_gradient: advantage is a constant weight for the
            # policy-gradient term.
            self.policy_loss = tf.reduce_mean(
                tf.stop_gradient(advantage) * log_p_s_a)
            self.entropy_loss = tf.reduce_mean(entropy)
            self.value_loss = tf.reduce_mean(advantage**2)
            loss = -self.policy_loss - entropy_beta * self.entropy_loss + self.value_loss
            self.gradients = tf.gradients(loss, self.train_vars)
            clipped_gs = [
                tf.clip_by_average_norm(g, grad_clip) for g in self.gradients
            ]
            # Worker gradients update the master's variables.
            self.train_op = master.optimizer.apply_gradients(
                zip(clipped_gs, master.train_vars))
        else:
            #self.optimizer = tf.train.AdamOptimizer(learning_rate,beta1=BETA)
            self.optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                       decay=decay,
                                                       use_locking=True)
def __train_ops(self):
    """Create RMSProp optimizer(s) and train op(s) for policy/value costs.

    With DUAL_RMSPROP, separate optimizers handle cost_p and cost_v;
    otherwise a single optimizer minimizes cost_p + cost_v.
    USE_GRAD_CLIP selects gradient clipping before apply.
    """
    if Config.DUAL_RMSPROP:
        self.opt_p = tf.train.RMSPropOptimizer(
            learning_rate=self.var_learning_rate,
            decay=Config.RMSPROP_DECAY,
            momentum=Config.RMSPROP_MOMENTUM,
            epsilon=Config.RMSPROP_EPSILON)
        self.opt_v = tf.train.RMSPropOptimizer(
            learning_rate=self.var_learning_rate,
            decay=Config.RMSPROP_DECAY,
            momentum=Config.RMSPROP_MOMENTUM,
            epsilon=Config.RMSPROP_EPSILON)
    else:
        self.cost_all = self.cost_p + self.cost_v
        self.opt = tf.train.RMSPropOptimizer(
            learning_rate=self.var_learning_rate,
            decay=Config.RMSPROP_DECAY,
            momentum=Config.RMSPROP_MOMENTUM,
            epsilon=Config.RMSPROP_EPSILON)
    if Config.USE_GRAD_CLIP:
        if Config.DUAL_RMSPROP:
            self.opt_grad_v = self.opt_v.compute_gradients(self.cost_v)
            self.opt_grad_v_clipped = [
                (tf.clip_by_norm(g, Config.GRAD_CLIP_NORM), v)
                for g, v in self.opt_grad_v if not g is None
            ]
            self.train_op_v = self.opt_v.apply_gradients(
                self.opt_grad_v_clipped)
            self.opt_grad_p = self.opt_p.compute_gradients(self.cost_p)
            self.opt_grad_p_clipped = [
                (tf.clip_by_norm(g, Config.GRAD_CLIP_NORM), v)
                for g, v in self.opt_grad_p if not g is None
            ]
            self.train_op_p = self.opt_p.apply_gradients(
                self.opt_grad_p_clipped)
            self.train_op = [self.train_op_p, self.train_op_v]
        else:
            # NOTE(review): this branch uses clip_by_average_norm (the
            # dual branch uses clip_by_norm) and does not filter None
            # gradients — confirm both differences are intentional.
            self.opt_grad = self.opt.compute_gradients(self.cost_all)
            self.opt_grad_clipped = [
                (tf.clip_by_average_norm(g, Config.GRAD_CLIP_NORM), v)
                for g, v in self.opt_grad
            ]
            self.train_op = self.opt.apply_gradients(self.opt_grad_clipped)
    else:
        if Config.DUAL_RMSPROP:
            # NOTE(review): the optimizers look swapped here — opt_p
            # minimizes cost_v and opt_v minimizes cost_p; verify.
            self.train_op_v = self.opt_p.minimize(
                self.cost_v, global_step=self.global_step)
            self.train_op_p = self.opt_v.minimize(
                self.cost_p, global_step=self.global_step)
            self.train_op = [self.train_op_p, self.train_op_v]
        else:
            self.train_op = self.opt.minimize(self.cost_all,
                                              global_step=self.global_step)
def _compute_current_gradients(self):
    """Compute this tower's gradients (optionally clipped) and append
    them to the corresponding tower gradient list(s)."""
    if GoConfig.USE_GRAD_CLIP:
        if GoConfig.DUAL_RMSPROP:
            # value
            # NOTE(review): the value pass lists 'OutputNet/logits_p' and
            # the policy pass lists 'OutputNet/logits_v' — whether these
            # strings include or exclude variables depends on
            # get_trainable_variables(); if they are inclusion lists the
            # heads look swapped. Verify.
            self.variables_to_train_v = self.get_trainable_variables(
                'resnet_v2_50, OutputNet/logits_p, global_step')
            self.opt_grad_v = self.opt_v.compute_gradients(
                self.cost_v, var_list=self.variables_to_train_v)
            self.opt_grad_v_clipped = [
                (tf.clip_by_norm(g, GoConfig.GRAD_CLIP_NORM), v)
                for g, v in self.opt_grad_v if not g is None
            ]
            self.tower_v_grads.append(self.opt_grad_v_clipped)
            # policy
            self.variables_to_train_p = self.get_trainable_variables(
                'resnet_v2_50, OutputNet/logits_v, global_step')
            self.opt_grad_p = self.opt_p.compute_gradients(
                self.cost_p, var_list=self.variables_to_train_p)
            self.opt_grad_p_clipped = [
                (tf.clip_by_norm(g, GoConfig.GRAD_CLIP_NORM), v)
                for g, v in self.opt_grad_p if not g is None
            ]
            self.tower_p_grads.append(self.opt_grad_p_clipped)
        else:
            # all: value + policy
            self.variables_to_train_all = self.get_trainable_variables(
                'resnet_v2_50, global_step')
            self.opt_grad = self.opt.compute_gradients(
                self.cost_all, var_list=self.variables_to_train_all)
            # NOTE(review): unlike the dual branch, this uses
            # clip_by_average_norm and does not filter None gradients.
            self.opt_grad_clipped = [
                (tf.clip_by_average_norm(g, GoConfig.GRAD_CLIP_NORM), v)
                for g, v in self.opt_grad
            ]
            self.tower_all_grads.append(self.opt_grad_clipped)
    else:
        if GoConfig.DUAL_RMSPROP:
            # value
            self.variables_to_train_v = self.get_trainable_variables(
                'resnet_v2_50, OutputNet/logits_p, global_step')
            self.opt_grad_v = self.opt_v.compute_gradients(
                self.cost_v, var_list=self.variables_to_train_v)
            self.tower_v_grads.append(self.opt_grad_v)
            # policy
            self.variables_to_train_p = self.get_trainable_variables(
                'resnet_v2_50, OutputNet/logits_v, global_step')
            self.opt_grad_p = self.opt_p.compute_gradients(
                self.cost_p, var_list=self.variables_to_train_p)
            self.tower_p_grads.append(self.opt_grad_p)
        else:
            # all: value + policy
            self.variables_to_train_all = self.get_trainable_variables(
                'resnet_v2_50, global_step')
            self.opt_grad = self.opt.compute_gradients(
                self.cost_all, var_list=self.variables_to_train_all)
            self.tower_all_grads.append(self.opt_grad)
def optimizer(self):
    """Adam on a non-trainable lr variable (1e-3); gradients clipped by
    average norm 0.5 and summarized."""
    lr_var = tf.get_variable('learning_rate', initializer=1e-3,
                             trainable=False)
    # This will also put the summary in tensorboard, stat.json and print
    # in terminal, but this time without moving average.
    tf.summary.scalar('lr', lr_var)
    # opt = tf.train.MomentumOptimizer(lr_var, 0.9)
    base = tf.train.AdamOptimizer(lr_var)
    procs = [
        gradproc.MapGradient(lambda g: tf.clip_by_average_norm(g, 0.5)),
        gradproc.SummaryGradient(),
    ]
    return optimizer.apply_grad_processors(base, procs)
def testClipByAverageNormZero(self):
    """clip_by_average_norm leaves an all-zero tensor unchanged
    (average norm is 0, so no clipping occurs)."""
    with self.test_session():
        zeros = tf.constant([0.0] * 6, shape=[2, 3])
        expected = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
        clipped = tf.clip_by_average_norm(zeros, 0.9)
        self.assertAllClose(expected, clipped.eval())
def testClipByAverageNormNotClipped(self):
    """No clipping when the average norm is already below the bound.

    Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333 < 0.9.
    """
    with self.test_session():
        x = tf.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
        expected = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
        clipped = tf.clip_by_average_norm(x, 0.9)
        self.assertAllClose(expected, clipped.eval())
def get_gradients(self, loss_or_grads, params): """ Note ---- The returned gradients may contain None value """ # check valid algorithm if self.algorithm is None or \ not hasattr(self.algorithm, 'compute_gradients') or \ not hasattr(self.algorithm, 'apply_gradients'): raise RuntimeError("Optimizer is None, or doesn't has attributes: " "compute_gradients and apply_gradients.") with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE) as scope: scope_name = scope.name # get the gradient grads_var = self.algorithm.compute_gradients(loss_or_grads, var_list=params) grads_var = {g: v for g, v in grads_var if g is not None} grads = list(grads_var.keys()) params = list(grads_var.values()) # ====== clipnorm ====== # if self.clipnorm is not None: if self.clip_alg == 'norm': grads = [tf.clip_by_norm(g, self.clipnorm) for g in grads] elif self.clip_alg == 'total_norm': grads, _ = tf.clip_by_global_norm(grads, self.clipnorm) elif self.clip_alg == 'avg_norm': grads = [tf.clip_by_average_norm(g, self.clipnorm) for g in grads] else: raise ValueError("Unknown norm clipping algorithm: '%s'" % self.clip_alg) # ====== clipvalue ====== # if self.clipvalue is not None: grads = [tf.clip_by_value(g, -self.clipvalue, self.clipvalue) for g in grads] # ====== get final norm value ====== # self._norm = add_roles(tf.global_norm(grads, name="GradientNorm"), GradientsNorm) # ====== setting Optimizer roles ====== # for v in get_all_variables(scope=scope_name): add_roles(v, roles=OptimizerVariable) return [(g, p) for g, p in zip(grads, params)]
def _buildOptimizer(self, learningRate, decay, momentum, epsilon, clipNorm):
    """
    - Creates a graph node for applying reducing the loss (self.applyGrads)
    learningRate : Learning rate to be applied to gradients
    decay : Discount for past gradients
    momentum : Gradient momentum
    epsilon : non zero offset
    clipNorm : Maximum average norm gradient allowed
    """
    optimizer = tf.train.RMSPropOptimizer(
        learning_rate = learningRate
        , decay = decay
        , momentum = momentum
        , epsilon = epsilon
    )
    # NOTE(review): `loss` is not a parameter of this method and not an
    # attribute access — unless a module-level `loss` tensor exists,
    # this raises NameError at graph-build time (self.loss may have
    # been intended). Verify against the enclosing module.
    grads = optimizer.compute_gradients(loss)
    # Clip every gradient by average norm before applying.
    clippedGrads = [(tf.clip_by_average_norm(grad, clipNorm),var) for grad,var in grads]
    self.applyGrads = optimizer.apply_gradients(clippedGrads)
def clip_gradients(grads_and_vars):
    """Clip (grad, var) pairs per th.clip_method / th.clip_threshold.

    Migrated from the deprecated GradientClipOptimizer. 'global_norm'
    clips jointly across all gradients; the other methods clip each
    gradient independently.
    """
    method_name = th.clip_method
    bound = th.clip_threshold
    assert method_name in ('norm', 'value', 'global_norm', 'avg_norm')
    if method_name == 'global_norm':
        grads = [g for g, _ in grads_and_vars]
        variables = [v for _, v in grads_and_vars]
        clipped, _ = tf.clip_by_global_norm(grads, bound)
        return list(zip(clipped, variables))
    per_tensor = {
        'norm': lambda g: tf.clip_by_norm(g, bound),
        'value': lambda g: tf.clip_by_value(g, -bound, bound),
        'avg_norm': lambda g: tf.clip_by_average_norm(g, bound),
    }[method_name]
    return [(per_tensor(g), v) for g, v in grads_and_vars]
def _compute_gradients(self, loss, var_list=None):
    """Compute gradients with optional NaN protection, learning-rate
    scaling, and clipping (per self._method / self._threshold)."""
    # Sanity check
    assert isinstance(loss, tf.Tensor)
    # Delegate the raw gradient computation to the wrapped optimizer.
    pairs = self._tf_optimizer.compute_gradients(loss, var_list=var_list)
    # Deal with NaN if necessary
    if hub.clip_nan_protection:
        pairs = [(self._deal_with_nan(g), v) for g, v in pairs]
    # Apply lr decay if necessary
    multiplier = hub.clip_lr_multiplier
    if multiplier < 1.0:
        assert multiplier > 0
        pairs = [(g * multiplier, v) for g, v in pairs]
    # Clip gradient if necessary
    if self._threshold > 0:
        bound = self._threshold
        if self._method in ('norm', 'value', 'avg_norm'):
            if self._method == 'norm':
                clip = lambda g: tf.clip_by_norm(g, bound)
            elif self._method == 'value':
                clip = lambda g: tf.clip_by_value(g, -bound, bound)
            else:
                clip = lambda g: tf.clip_by_average_norm(g, bound)
            pairs = [(clip(g), v) for g, v in pairs]
        else:
            assert self._method == 'global_norm'
            grads = [g for g, _ in pairs]
            variables = [v for _, v in pairs]
            clipped, _ = tf.clip_by_global_norm(grads, self._threshold)
            pairs = list(zip(clipped, variables))
    return pairs
def get_gradient_processor(self):
    """Gradient processors: clip each gradient by average norm 0.1,
    then record gradient summaries."""
    clip = MapGradient(lambda g: tf.clip_by_average_norm(g, 0.1))
    return [clip, SummaryGradient()]
def _create_graph(self):
    """Build the A3C graph: conv trunk over image input, value and
    policy heads, policy/value costs, and RMSProp train op(s) with
    optional gradient clipping."""
    self.x = tf.placeholder(tf.float32, [None, self.img_height, self.img_width, self.img_channels], name='X')
    # NOTE(review): `odometry` is declared but not consumed in this
    # method — presumably fed elsewhere; verify.
    self.odometry = tf.placeholder(tf.float32, [None, 7], name='odometry')
    self.y_r = tf.placeholder(tf.float32, [None], name='Yr')
    self.action_index = tf.placeholder(tf.float32, [None, self.num_actions])
    self.var_beta = tf.placeholder(tf.float32, name='beta', shape=[])
    self.var_learning_rate = tf.placeholder(tf.float32, name='lr', shape=[])
    self.global_step = tf.Variable(0, trainable=False, name='step')
    # As implemented in A3C paper
    self.n1 = self.conv2d_layer(self.x, 8, 16, 'conv11', strides=[1, 4, 4, 1])
    self.n2 = self.conv2d_layer(self.n1, 4, 32, 'conv12', strides=[1, 2, 2, 1])
    # _input = self.n2
    # flatten_input_shape = _input.get_shape()
    # nb_elements = flatten_input_shape[1] * flatten_input_shape[2] * flatten_input_shape[3]
    self.flat = tf.contrib.layers.flatten(self.n2)
    self.d1 = self.dense_layer(self.flat, 256, 'dense1')
    self.logits_v = tf.squeeze(self.dense_layer(self.d1, 1, 'logits_v', func=None), axis=[1])
    self.cost_v = 0.5 * tf.reduce_sum(tf.square(self.y_r - self.logits_v), axis=0)
    self.logits_p = self.dense_layer(self.d1, self.num_actions, 'logits_p')
    if Config.USE_LOG_SOFTMAX:
        self.softmax_p = tf.nn.softmax(self.logits_p)
        self.log_softmax_p = tf.nn.log_softmax(self.logits_p)
        self.log_selected_action_prob = tf.reduce_sum(self.log_softmax_p * self.action_index, axis=1)
        # stop_gradient: the advantage acts as a constant for the policy term.
        self.cost_p_1 = self.log_selected_action_prob * (self.y_r - tf.stop_gradient(self.logits_v))
        self.cost_p_2 = -1 * self.var_beta * \
            tf.reduce_sum(self.log_softmax_p * self.softmax_p, axis=1)
    else:
        # Smooth the softmax away from zero by MIN_POLICY, renormalized.
        self.softmax_p = (tf.nn.softmax(self.logits_p) + Config.MIN_POLICY) / (1.0 + Config.MIN_POLICY * self.num_actions)
        self.selected_action_prob = tf.reduce_sum(self.softmax_p * self.action_index, axis=1)
        self.cost_p_1 = tf.log(tf.maximum(self.selected_action_prob, self.log_epsilon)) * (self.y_r - tf.stop_gradient(self.logits_v))
        self.cost_p_2 = -1 * self.var_beta * \
            tf.reduce_sum(tf.log(tf.maximum(self.softmax_p, self.log_epsilon)) * self.softmax_p, axis=1)
    self.cost_p_1_agg = tf.reduce_sum(self.cost_p_1, axis=0)
    self.cost_p_2_agg = tf.reduce_sum(self.cost_p_2, axis=0)
    self.cost_p = -(self.cost_p_1_agg + self.cost_p_2_agg)
    if Config.DUAL_RMSPROP:
        self.opt_p = tf.train.RMSPropOptimizer(
            learning_rate=self.var_learning_rate,
            decay=Config.RMSPROP_DECAY,
            momentum=Config.RMSPROP_MOMENTUM,
            epsilon=Config.RMSPROP_EPSILON)
        self.opt_v = tf.train.RMSPropOptimizer(
            learning_rate=self.var_learning_rate,
            decay=Config.RMSPROP_DECAY,
            momentum=Config.RMSPROP_MOMENTUM,
            epsilon=Config.RMSPROP_EPSILON)
    else:
        self.cost_all = self.cost_p + self.cost_v
        self.opt = tf.train.RMSPropOptimizer(
            learning_rate=self.var_learning_rate,
            decay=Config.RMSPROP_DECAY,
            momentum=Config.RMSPROP_MOMENTUM,
            epsilon=Config.RMSPROP_EPSILON)
    if Config.USE_GRAD_CLIP:
        if Config.DUAL_RMSPROP:
            self.opt_grad_v = self.opt_v.compute_gradients(self.cost_v)
            self.opt_grad_v_clipped = [(tf.clip_by_norm(g, Config.GRAD_CLIP_NORM), v)
                                       for g, v in self.opt_grad_v if not g is None]
            self.train_op_v = self.opt_v.apply_gradients(self.opt_grad_v_clipped)
            self.opt_grad_p = self.opt_p.compute_gradients(self.cost_p)
            self.opt_grad_p_clipped = [(tf.clip_by_norm(g, Config.GRAD_CLIP_NORM), v)
                                       for g, v in self.opt_grad_p if not g is None]
            self.train_op_p = self.opt_p.apply_gradients(self.opt_grad_p_clipped)
            self.train_op = [self.train_op_p, self.train_op_v]
        else:
            # NOTE(review): uses clip_by_average_norm here (clip_by_norm
            # in the dual branch) and does not filter None gradients.
            self.opt_grad = self.opt.compute_gradients(self.cost_all)
            self.opt_grad_clipped = [(tf.clip_by_average_norm(g, Config.GRAD_CLIP_NORM), v)
                                     for g, v in self.opt_grad]
            self.train_op = self.opt.apply_gradients(self.opt_grad_clipped)
    else:
        if Config.DUAL_RMSPROP:
            # NOTE(review): optimizers appear swapped (opt_p minimizes
            # cost_v, opt_v minimizes cost_p) — verify.
            self.train_op_v = self.opt_p.minimize(self.cost_v, global_step=self.global_step)
            self.train_op_p = self.opt_v.minimize(self.cost_p, global_step=self.global_step)
            self.train_op = [self.train_op_p, self.train_op_v]
        else:
            self.train_op = self.opt.minimize(self.cost_all,
                                              global_step=self.global_step)
nthreads=1, random_crop=config.RANDOM_CROP).data_pipeline(1) static_inpainted_images = model.build_static_infer_graph( static_images, config, name='static_view/%d' % i) # training settings lr = tf.get_variable('lr', shape=[], trainable=False, initializer=tf.constant_initializer(1e-4)) d_optimizer = tf.train.AdamOptimizer(lr, beta1=0.5, beta2=0.9) g_optimizer = d_optimizer # gradient processor if config.GRADIENT_CLIP: gradient_processor = lambda grad_var: (tf.clip_by_average_norm( grad_var[0], config.GRADIENT_CLIP_VALUE), grad_var[1]) else: gradient_processor = None # log dir log_prefix = 'model_logs/' + '_'.join([ ng.date_uid(), socket.gethostname(), config.DATASET, 'MASKED' if config.GAN_WITH_MASK else 'NORMAL', config.GAN, config.LOG_DIR ]) # train discriminator with secondary trainer, should initialize before primary trainer. discriminator_training_callback = ng.callbacks.SecondaryTrainer( pstep=1, optimizer=d_optimizer, var_list=d_vars,
def train(params, output, train_eval_bundle, dev_bundle, batches_x, batches_y, key_array, embed_keys, train_x, train_y):
    """Train the text CNN and evaluate on the dev set after every epoch.

    Builds the network via define_nn, trains with Adagrad or Adam (per
    params['Adagrad']), applies L2-norm clipping to the weights after each
    batch, checkpoints whenever dev accuracy improves, and early-stops if
    dev accuracy drops more than .02 below the best seen.

    :param params: hyper-parameter dict (LEARNING_RATE, EPOCHS, BATCH_SIZE,
        TRAIN_DROPOUT, L2_NORM_CONSTRAINT, OUTPUT_FILE_NAME, ...); mutated:
        params['epoch'] is set each epoch.
    :param output: writable file-like object used for progress logging.
    :param train_eval_bundle, dev_bundle: evaluation bundles for sum_prob.
    :param batches_x, batches_y: initial training batches (reshuffled per epoch).
    :param key_array: unused here; kept for interface compatibility.
    :param embed_keys, train_x, train_y: inputs for re-batching via scramble_batches.
    :return: cumulative wall-clock time spent in epochs.
    """
    with tf.Graph().as_default():
        x, y_, dropout, weights, biases, W_fc, b_fc, log_loss, correct_prediction = define_nn(params)
        # BUG FIX: the optimizers previously minimized the undefined name
        # `cross_entropy` (NameError); the loss tensor returned by define_nn
        # is `log_loss`.
        if params['Adagrad']:
            train_step = tf.train.AdagradOptimizer(params['LEARNING_RATE']).minimize(log_loss)
        else:
            train_step = tf.train.AdamOptimizer(params['LEARNING_RATE']).minimize(log_loss)
        # BUG FIX: the original clipping loop only rebound the Python loop
        # variables (`W = tf.clip_by_average_norm(W, ...)`), which never
        # modified the graph. Build the assign ops ONCE here (building ops
        # inside the training loop would also grow the graph every batch)
        # and run them after each train step.
        clip_ops = [v.assign(tf.clip_by_average_norm(v, params['L2_NORM_CONSTRAINT']))
                    for v in list(weights) + list(biases) + [W_fc, b_fc]]
        # Hoisted diagnostic ops (previously rebuilt every batch).
        w0_l2 = tf.div(tf.sqrt(tf.nn.l2_loss(weights[0])), tf.convert_to_tensor(2.0))
        w0_sum = tf.reduce_sum(weights[0])
        saver = tf.train.Saver(tf.all_variables())
        # run session
        output.write('Initializing session...\n\n')
        sess = tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=2,
                                                intra_op_parallelism_threads=3,
                                                use_per_session_threads=True))
        sess.run(tf.initialize_all_variables())
        output.write('Running session...\n\n')
        output.write('setup time: %g\n' % (time.clock()))
        best_dev_accuracy = 0
        train_softmax = sum_prob(x, y_, train_eval_bundle, params, log_loss, dropout, sess)
        initial_accuracy = sum_prob(x, y_, train_eval_bundle, params, correct_prediction, dropout, sess)
        output.write("initial accuracy %g softmax%g \n" % (initial_accuracy, train_softmax))
        output.write('start time: ' + str(time.clock()) + '\n')
        time_index = time.clock()
        epoch_time = 0
        for i in range(params['EPOCHS']):
            params['epoch'] = i + 1
            for j in range(len(batches_x)):
                train_step.run(feed_dict={x: batches_x[j], y_: batches_y[j],
                                          dropout: params['TRAIN_DROPOUT']},
                               session=sess)
                # apply l2 clipping to weights and biases
                with sess.as_default():
                    if j == 0:
                        l2_loss = w0_l2.eval()
                        output.write('l2 loss is %g' % l2_loss)
                    check_l2 = w0_sum.eval()
                    sess.run(clip_ops)
                    # Clipping shrinks the weights, so the sum drops when it fires.
                    if np.asscalar(check_l2) > np.asscalar(w0_sum.eval()):
                        output.write('weights clipped\n')
            if params['BATCH_SIZE'] == 1:
                batches_x, batches_y = shuffle_in_unison(batches_x, batches_y)
            else:
                batches_x, batches_y = scramble_batches(train_x, train_y, params, embed_keys,
                                                        train_eval_bundle[2], train_eval_bundle[3])
            train_softmax = sum_prob(x, y_, train_eval_bundle, params, log_loss, dropout, sess)
            train_accuracy = sum_prob(x, y_, train_eval_bundle, params, correct_prediction, dropout, sess)
            output.write("epoch %d, training accuracy %g, training softmax error %g \n"
                         % (i, train_accuracy, train_softmax))
            dev_accuracy = sum_prob(x, y_, dev_bundle, params, correct_prediction, dropout, sess)
            dev_softmax = sum_prob(x, y_, dev_bundle, params, log_loss, dropout, sess)
            output.write("dev set accuracy %g, softmax %g \n" % (dev_accuracy, dev_softmax))
            if dev_accuracy > best_dev_accuracy:
                saver.save(sess, 'text_cnn_run' + params['OUTPUT_FILE_NAME'],
                           global_step=params['epoch'])
                best_dev_accuracy = dev_accuracy
            if dev_accuracy < best_dev_accuracy - .02:
                # early stop if accuracy drops significantly
                break
            output.write('epoch time : ' + str(time.clock() - time_index))
            epoch_time += time.clock() - time_index
            time_index = time.clock()
            output.write('. elapsed: ' + str(time.clock()) + '\n')
    return epoch_time
def _create_graph(self):
    """Build the A3C-style TF graph on self: placeholders, a dense trunk,
    a value head and a policy head, their costs, and the training op(s).

    Reads self.player_count, self.player_dimension, self.num_actions,
    self.log_epsilon and Config.* flags; all created ops/variables are
    stored as attributes so callers can feed/fetch them.
    """
    # --- inputs ---
    self.x = tf.placeholder(
        tf.float32, [None, self.player_count * self.player_dimension], name='X')
    self.y_r = tf.placeholder(tf.float32, [None], name='Yr')  # discounted returns
    # one-hot selected actions
    self.action_index = tf.placeholder(tf.float32, [None, self.num_actions])
    self.var_beta = tf.placeholder(tf.float32, name='beta', shape=[])  # entropy weight
    self.var_learning_rate = tf.placeholder(tf.float32, name='lr', shape=[])
    self.global_step = tf.Variable(0, trainable=False, name='step')

    # --- shared dense trunk ---
    self.d1 = self.dense_layer(self.x, 128, 'dense1')
    self.d2 = self.dense_layer(self.d1, 16, 'dense2')
    self.d3 = self.dense_layer(self.d2, 128, 'dense3')

    # --- value head and its 0.5 * sum of squared errors cost ---
    self.logits_v = tf.squeeze(
        self.dense_layer(self.d3, 1, 'logits_v', func=None), axis=[1])
    self.cost_v = 0.5 * tf.reduce_sum(tf.square(self.y_r - self.logits_v), axis=0)

    # --- policy head ---
    self.logits_p = self.dense_layer(self.d3, self.num_actions, 'logits_p', func=None)
    if Config.USE_LOG_SOFTMAX:
        self.softmax_p = tf.nn.softmax(self.logits_p)
        self.log_softmax_p = tf.nn.log_softmax(self.logits_p)
        self.log_selected_action_prob = tf.reduce_sum(
            self.log_softmax_p * self.action_index, axis=1)
        # policy-gradient term: log pi(a|s) * advantage, value treated as constant
        self.cost_p_1 = self.log_selected_action_prob * (
            self.y_r - tf.stop_gradient(self.logits_v))
        # entropy bonus, scaled by beta
        self.cost_p_2 = -1 * self.var_beta * \
            tf.reduce_sum(self.log_softmax_p * self.softmax_p, axis=1)
    else:
        # smooth the policy away from zero so the logs below stay finite
        self.softmax_p = (tf.nn.softmax(self.logits_p) + Config.MIN_POLICY
                          ) / (1.0 + Config.MIN_POLICY * self.num_actions)
        self.selected_action_prob = tf.reduce_sum(
            self.softmax_p * self.action_index, axis=1)
        self.cost_p_1 = tf.log(tf.maximum(self.selected_action_prob, self.log_epsilon)) \
            * (self.y_r - tf.stop_gradient(self.logits_v))
        self.cost_p_2 = -1 * self.var_beta * \
            tf.reduce_sum(tf.log(tf.maximum(self.softmax_p, self.log_epsilon))
                          * self.softmax_p, axis=1)
    self.cost_p_1_agg = tf.reduce_sum(self.cost_p_1, axis=0)
    self.cost_p_2_agg = tf.reduce_sum(self.cost_p_2, axis=0)
    # negated because the optimizers below minimize
    self.cost_p = -(self.cost_p_1_agg + self.cost_p_2_agg)

    # --- optimizers: either one per head, or a single one on the sum ---
    if Config.DUAL_RMSPROP:
        self.opt_p = tf.train.RMSPropOptimizer(
            learning_rate=self.var_learning_rate,
            decay=Config.RMSPROP_DECAY,
            momentum=Config.RMSPROP_MOMENTUM,
            epsilon=Config.RMSPROP_EPSILON)
        self.opt_v = tf.train.RMSPropOptimizer(
            learning_rate=self.var_learning_rate,
            decay=Config.RMSPROP_DECAY,
            momentum=Config.RMSPROP_MOMENTUM,
            epsilon=Config.RMSPROP_EPSILON)
    else:
        self.cost_all = self.cost_p + self.cost_v
        self.opt = tf.train.RMSPropOptimizer(
            learning_rate=self.var_learning_rate,
            decay=Config.RMSPROP_DECAY,
            momentum=Config.RMSPROP_MOMENTUM,
            epsilon=Config.RMSPROP_EPSILON)

    # --- train ops ---
    if Config.USE_GRAD_CLIP:
        if Config.DUAL_RMSPROP:
            self.opt_grad_v = self.opt_v.compute_gradients(self.cost_v)
            self.opt_grad_v_clipped = [
                (tf.clip_by_norm(g, Config.GRAD_CLIP_NORM), v)
                for g, v in self.opt_grad_v if g is not None]
            self.train_op_v = self.opt_v.apply_gradients(self.opt_grad_v_clipped)
            self.opt_grad_p = self.opt_p.compute_gradients(self.cost_p)
            self.opt_grad_p_clipped = [
                (tf.clip_by_norm(g, Config.GRAD_CLIP_NORM), v)
                for g, v in self.opt_grad_p if g is not None]
            self.train_op_p = self.opt_p.apply_gradients(self.opt_grad_p_clipped)
            self.train_op = [self.train_op_p, self.train_op_v]
        else:
            self.opt_grad = self.opt.compute_gradients(self.cost_all)
            # BUG FIX: filter out None gradients before clipping, consistent
            # with the dual branch above; tf.clip_by_average_norm raises on
            # a None gradient (e.g. a variable unused by the cost).
            self.opt_grad_clipped = [
                (tf.clip_by_average_norm(g, Config.GRAD_CLIP_NORM), v)
                for g, v in self.opt_grad if g is not None]
            self.train_op = self.opt.apply_gradients(self.opt_grad_clipped)
    else:
        if Config.DUAL_RMSPROP:
            # BUG FIX: the optimizers were swapped here (opt_p minimized
            # cost_v and opt_v minimized cost_p), so each head trained with
            # the other head's RMSProp slot state. Pair them correctly.
            self.train_op_v = self.opt_v.minimize(
                self.cost_v, global_step=self.global_step)
            self.train_op_p = self.opt_p.minimize(
                self.cost_p, global_step=self.global_step)
            self.train_op = [self.train_op_p, self.train_op_v]
        else:
            self.train_op = self.opt.minimize(self.cost_all,
                                              global_step=self.global_step)
static_fnames = val_fnames[i:i+1] static_images = ng.data.DataFromFNames( static_fnames, config.IMG_SHAPES, nthreads=1, random_crop=config.RANDOM_CROP, random_flip=config.RANDOM_FLIP).data_pipeline(1) static_inpainted_images = model.build_static_infer_graph( static_images[0], config, name='static_view/%d' % i, exclusionmask=images[exclusionmask_index] if config.EXC_MASKS else None) # training settings lr = tf.get_variable( 'lr', shape=[], trainable=False, initializer=tf.constant_initializer(1e-4)) d_optimizer = tf.train.AdamOptimizer(lr, beta1=0.5, beta2=0.9) g_optimizer = d_optimizer # gradient processor if config.GRADIENT_CLIP: gradient_processor = lambda grad_var: ( tf.clip_by_average_norm(grad_var[0], config.GRADIENT_CLIP_VALUE), grad_var[1]) else: gradient_processor = None # log dir log_prefix = 'model_logs/' + '_'.join([ ng.date_uid(), socket.gethostname(), config.DATASET, 'MASKED' if config.GAN_WITH_MASK else 'NORMAL', config.GAN,config.LOG_DIR]) # train discriminator with secondary trainer, should initialize before # primary trainer. discriminator_training_callback = ng.callbacks.SecondaryTrainer( pstep=1, optimizer=d_optimizer, var_list=d_vars, max_iters=5,
def _post_process_grad(self, grad, var, global_info):
    """
    Apply the configured per-variable gradient transforms, in a fixed order:
    maximize-grad-norm extension, gradient accumulation, optional debug
    summaries, noise, value clip, norm clip, average-norm clip, global-norm
    clip, NaN/Inf filtering, and zero-on-high-global-norm. Each option is read
    from the variable's updater opts with the global config value as fallback.

    :param tf.Tensor grad: raw gradient for `var`
    :param tf.Variable var: the variable the gradient belongs to
    :param WrapOptimizer._GetGlobalInfo global_info: shared helper for
        global-norm-based transforms across all variables
    :return: new grad, apply grad opts (opt_key + accum steps for apply_gradients)
    :rtype: tf.Tensor, dict[str]
    """
    # Per-variable opts override the global config; each falls back to 0 /
    # None, i.e. "disabled".
    updater_opts = self._get_updater_opts_from_var(var)
    accum_grad_multiple_num_steps = updater_opts.get(
        "accum_grad_multiple_step", self.config.int("accum_grad_multiple_step", 0))
    grad_noise = updater_opts.get("gradient_noise", self.config.float("gradient_noise", 0.0))
    grad_clip = updater_opts.get("gradient_clip", self.config.float("gradient_clip", 0.0))
    # E.g. https://github.com/openai/baselines/blob/master/baselines/deepq/simple.py:
    # grad_norm_clipping=10 -> tf.clip_by_norm
    grad_clip_norm = updater_opts.get(
        "gradient_clip_norm", self.config.float("gradient_clip_norm", 0.0))
    grad_clip_avg_norm = updater_opts.get(
        "gradient_clip_avg_norm", self.config.float("gradient_clip_avg_norm", 0.0))
    grad_clip_global_norm = updater_opts.get(
        "gradient_clip_global_norm", self.config.float("gradient_clip_global_norm", 0.0))
    # Tags group variables that share one global norm; the clip tag defaults
    # to the general tag.
    global_norm_tag = updater_opts.get(
        "global_norm_tag", self.config.value("global_norm_tag", None))
    grad_clip_global_norm_tag = updater_opts.get(
        "gradient_clip_global_norm_tag",
        self.config.value("gradient_clip_global_norm_tag", global_norm_tag))
    grad_norm_to_clip_to_zero = updater_opts.get(
        "grad_norm_to_clip_to_zero", self.config.float("grad_norm_to_clip_to_zero", 0.0))
    maximize_grad_norm = updater_opts.get(
        "maximize_grad_norm", self.config.float("maximize_grad_norm", 0))
    if maximize_grad_norm:
        # Add an auxiliary gradient that pushes the grad norm up (may be None
        # if this var does not participate).
        grad_ext = global_info.get_maximize_grad_norm_grad(maximize_grad_norm, var)
        if grad_ext is not None:
            grad += grad_ext
    if accum_grad_multiple_num_steps >= 1:
        # Accumulate over several steps before an actual update.
        grad = accum_grad_multiple_step(
            grad, var, train_step=self.global_train_step,
            num_accum_steps=accum_grad_multiple_num_steps)
    if updater_opts.get("debug_grad_summaries",
                        self.config.bool_or_other("debug_grad_summaries", False)):
        # Local import: TFUtil is only needed when the debug option is on.
        from TFUtil import variable_summaries, get_base_name, reuse_name_scope_of_tensor
        with reuse_name_scope_of_tensor(grad, prefix="grads/"):
            variable_summaries(grad, name="grad_of_%s" % get_base_name(var))
        with reuse_name_scope_of_tensor(var, prefix="vars/"):
            variable_summaries(var, name=get_base_name(var))
    # Also see tf.contrib.layers.optimizers.optimize_loss() for reference.
    if grad_noise:
        assert grad_noise > 0
        from TFUtil import add_scaled_noise_to_gradients
        with tf.name_scope("grad_noise"):
            (grad, var), = add_scaled_noise_to_gradients([(grad, var)], grad_noise)
    if grad_clip:
        # Element-wise clip to [-grad_clip, grad_clip].
        assert grad_clip > 0
        with tf.name_scope("grad_clip"):
            grad = tf.clip_by_value(grad, -grad_clip, grad_clip)
    if grad_clip_norm:
        # Clip by this tensor's own L2 norm.
        assert grad_clip_norm > 0
        with tf.name_scope("grad_clip_norm"):
            grad = tf.clip_by_norm(grad, grad_clip_norm)
    if grad_clip_avg_norm:
        # Clip by the average-per-element L2 norm (tf.clip_by_average_norm).
        assert grad_clip_avg_norm > 0
        with tf.name_scope("grad_clip_avg_norm"):
            grad = tf.clip_by_average_norm(grad, grad_clip_avg_norm)
    if grad_clip_global_norm:
        # Clip relative to the global norm over all vars sharing the tag.
        assert grad_clip_global_norm > 0
        with tf.name_scope("grad_clip_global_norm"):
            grad = global_info.clip_by_global_norm(
                grad, clip_norm=grad_clip_global_norm,
                global_norm_tag=grad_clip_global_norm_tag)
    if updater_opts.get("gradient_nan_inf_filter",
                        self.config.bool("gradient_nan_inf_filter", False)):
        # Replace NaN/Inf entries by 0 instead of corrupting the update.
        from TFUtil import nan_to_num
        grad = nan_to_num(grad, nan_num=0.0, inf_num=0.0)
    if grad_norm_to_clip_to_zero:
        # Drop the whole gradient when the (tagged) global norm is too high.
        with tf.name_scope("grad_norm_to_clip_to_zero"):
            grad = global_info.set_zero_on_high_global_norm(
                grad, grad_norm_threshold=grad_norm_to_clip_to_zero,
                global_norm_tag=global_norm_tag)
    # Catch typos: every per-variable option must have been consumed above.
    updater_opts.assert_all_read()
    opt_key, _ = self._get_optimizer_item_for_variable(var)
    apply_grad_opts = {
        "opt_key": opt_key, "accum_grad_multiple_num_steps": accum_grad_multiple_num_steps}
    return grad, apply_grad_opts