def test_loss(self, dist_norm, top_k, worst_case_loss):
    """Check the large-margin loss on a toy conv network.

    A small conv -> spatial mean -> dense model is built on random inputs,
    and the margin loss is computed over the input and the convolution
    activations. The test asserts that every global variable gets a
    gradient and that the evaluated loss has shape ``()``.
    """
    n_examples = 3
    n_classes = 10
    input_dims = (12, 12, 1)

    # Random (unseeded) images and integer labels.
    pixel_values = np.random.rand(*((n_examples, ) + input_dims))
    images = tf.convert_to_tensor(pixel_values, dtype=tf.float32)
    label_values = np.random.randint(0, high=n_classes, size=n_examples)
    labels = tf.convert_to_tensor(label_values, dtype=tf.int32)

    # Toy model: convolution, mean over the two spatial axes, dense output.
    conv_layer = tf.keras.layers.Conv2D(
        filters=8,
        kernel_size=3,
        strides=(1, 1),
        padding="same",
        activation=tf.nn.relu)
    conv_out = conv_layer(images)
    pooled = tf.reduce_mean(conv_out, axis=[1, 2])
    logits = tf.keras.layers.Dense(n_classes)(pooled)

    loss = margin_loss.large_margin(
        logits=logits,
        one_hot_labels=tf.one_hot(labels, n_classes),
        layers_list=[images, conv_out],
        gamma=10000,
        alpha_factor=4,
        top_k=top_k,
        dist_norm=dist_norm,
        worst_case_loss=worst_case_loss)

    all_variables = tf.global_variables()
    init_op = tf.global_variables_initializer()

    # Every variable must receive a gradient from the loss.
    for grad in tf.gradients(loss, all_variables):
        self.assertIsNotNone(grad)

    # The loss must evaluate to a scalar.
    with self.test_session() as sess:
        sess.run(init_op)
        loss_value = sess.run(loss)
        self.assertEqual(loss_value.shape, ())
def add_optimizer_op(self, loss, lr_input):
    """Create the training op for ``loss`` using the configured optimizer.

    Args:
        loss: Loss tensor to minimise.
        lr_input: Learning rate passed to the optimizer constructor.

    Returns:
        The op returned by ``apply_gradients`` (when gradient clipping is
        enabled) or by ``minimize`` (otherwise); ``None`` when
        ``self.optimizer`` is not one of the supported names.
    """
    # Constructors are wrapped in lambdas so only the selected optimizer
    # (and the attributes it needs) is touched.
    optimizer_factories = {
        "gd": lambda: tf.train.GradientDescentOptimizer(lr_input),
        "adadelta": lambda: tf.train.AdadeltaOptimizer(lr_input),
        "adagrad": lambda: tf.train.AdagradOptimizer(lr_input),
        "adam": lambda: tf.train.AdamOptimizer(
            lr_input,
            beta1=self.beta1,
            beta2=self.beta2,
            epsilon=self.epsilon),
        "momentum": lambda: tf.train.MomentumOptimizer(
            lr_input, self.momentum),
        "rmsprop": lambda: tf.train.RMSPropOptimizer(
            lr_input, momentum=self.momentum),
    }
    factory = optimizer_factories.get(self.optimizer)
    if factory is None:
        print(
            "Optimizer arg should be one of [gd, adadelta, adagrad, adam, momentum, rmsprop]."
        )
        return None
    optimizer = factory()

    if not (self.clipping_norm > 0 or self.save_weights):
        return optimizer.minimize(loss)

    trainables = tf.trainable_variables()
    grads = tf.gradients(loss, trainables)

    if self.save_weights:
        for grad in grads:
            # tf.gradients yields None for variables the loss does not
            # depend on; those have no name and nothing to summarise.
            if grad is None:
                continue
            util.add_summaries("",
                               self.name,
                               grad,
                               header_name=grad.name + "/",
                               save_stddev=True,
                               save_mean=True)

    if self.clipping_norm > 0:
        clipped_grads, _ = tf.clip_by_global_norm(
            grads, clip_norm=self.clipping_norm)
        return optimizer.apply_gradients(zip(clipped_grads, trainables))

    return optimizer.minimize(loss)
def compute_gradients(self, loss, var_list=None, **kwargs):
    """Compute gradients of ``loss`` plus optional L1/L2 penalties.

    Args:
        loss: Loss tensor.
        var_list: Variables to differentiate with respect to. When empty or
            ``None`` the ``TRAINABLE_VARIABLES`` collection is used.
        **kwargs: Accepted for signature compatibility; not used here.

    Returns:
        A list of ``(gradient, variable)`` pairs. A list (rather than a
        one-shot ``zip`` iterator) is returned so callers can iterate over
        the pairs more than once.
    """
    if not var_list:
        var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

    if self._l1:
        l1 = tf.add_n(
            [tf.reduce_sum(tf.abs(p)) * self._reg_factor for p in var_list])
        loss = loss + l1 * self._l1

    if self._l2:
        l2 = tf.add_n(
            [tf.reduce_sum(tf.square(p)) * self._reg_factor for p in var_list])
        loss = loss + l2 * self._l2

    grads = tf.gradients(loss, var_list, colocate_gradients_with_ops=True)
    return list(zip(grads, var_list))
def dense_transformer_fwd_and_grad(transformer, input_activation):
    """Build forward, backward and Adam-update ops for one encoder layer.

    Args:
        transformer: Object exposing ``encoder_layer``.
        input_activation: Tensor fed to the encoder layer.

    Returns:
        A dict holding the layer output (``"output_activation"``), the
        gradient of the summed output w.r.t. the input (``"input_grad"``)
        and one ``"<variable op name>_grad"`` entry per pair returned by
        the optimizer's ``compute_gradients``.
    """
    layer_output = transformer.encoder_layer(input_activation,
                                             mask=None,
                                             debug_name="layer_0")
    loss = tf.reduce_sum(layer_output)

    adam = tf.train.AdamOptimizer(learning_rate=1e-3)
    grads_and_vars = adam.compute_gradients(loss)
    input_grad = tf.gradients(loss, input_activation)[0]

    # The weight update is ordered after the input gradient.
    with tf.control_dependencies([input_grad]):
        train_op = adam.apply_gradients(grads_and_vars)

    # NOTE(review): no new ops are created inside this scope, so the
    # dependency on train_op may not attach to the returned tensors —
    # confirm this is intended.
    with tf.control_dependencies([train_op]):
        stream_ops = {
            "output_activation": layer_output,
            "input_grad": input_grad,
        }
        stream_ops.update(
            (var.op.name + "_grad", grad) for grad, var in grads_and_vars)
    return stream_ops
def create_training_method(self):
    """Define the loss, the Adam training ops and the action gradients.

    Creates the target placeholder ``v_input``, a mean-squared-error loss
    against ``q_value_output`` with L2 weight decay over ``self.net``, three
    Adam minimisers (all variables, the action/combined subset, and the
    state/combined subset) and the gradient of the Q output w.r.t. the
    action input.
    """
    l2_coeff = 0.001
    self.v_input = tf.placeholder("float", [None, 1])

    decay_terms = [l2_coeff * tf.nn.l2_loss(var) for var in self.net]
    weight_decay = tf.add_n(decay_terms)

    squared_error = tf.square(self.v_input - self.q_value_output)
    self.loss = tf.reduce_mean(squared_error)
    self.cost = self.loss + weight_decay

    # Update over every variable.
    self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.cost)

    # Update restricted to the action branch plus two entries of self.net.
    action_com_var_list = self.action_net + [self.net[11], self.net[12]]
    self.optimizer_action_com = tf.train.AdamOptimizer(self.lr).minimize(
        self.cost, var_list=action_com_var_list)

    # Update restricted to the state branch plus the combined layers.
    state_com_var_list = self.state_net + self.combined_net
    self.optimizer_state_com = tf.train.AdamOptimizer(self.lr).minimize(
        self.cost, var_list=state_com_var_list)

    self.action_gradients = tf.gradients(self.q_value_output,
                                         self.action_input)
def loop_series_terms(k, output_grads, trace):
    """Add the k-th alternating series term to the running trace.

    Relies on ``Gx``, ``x``, ``u_shape`` and ``u_reshaped`` from the
    enclosing scope.

    :param k: loop counter over the series terms
    :param output_grads: (batch_size, height, width, num_channel)
    :param trace: (batch_size,)
    :return: tuple ``(k + 1, grads, updated trace)``
    """
    # shape (batch_size, height, width, num_channel)
    grads = tf.gradients(Gx, x, output_grads)[0]
    # shape (batch_size, 1, h*w*c)
    grads_reshaped = tf.reshape(grads, (u_shape[0], 1, -1))

    # +1 for even k, -1 for odd k.
    sign = tf.cond(tf.equal(k % 2, 0), lambda: 1.0, lambda: -1.0)
    term = sign * tf.matmul(grads_reshaped, u_reshaped) / tf.cast(
        k + 1, tf.float32)
    trace = trace + tf.squeeze(term, axis=[1, 2])
    return k + 1, grads, trace
def energy_and_forces_from_atomic_properties(self,
                                             Ea,
                                             Qa,
                                             Dij,
                                             Z,
                                             R,
                                             idx_i,
                                             idx_j,
                                             Q_tot=None,
                                             batch_seg=None):
    """Return the energy and its negative gradient with respect to ``R``.

    The energy comes from ``energy_from_atomic_properties``; the forces are
    minus the gradient of the summed energy w.r.t. ``R``.

    Returns:
        Tuple ``(energy, forces)``.
    """
    with tf.compat.v1.name_scope("energy_and_forces_from_atomic_properties"):
        energy = self.energy_from_atomic_properties(Ea, Qa, Dij, Z, idx_i,
                                                    idx_j, Q_tot, batch_seg)
        total_energy = tf.reduce_sum(input_tensor=energy)
        energy_grad = tf.gradients(ys=total_energy, xs=R)[0]
        forces = -tf.convert_to_tensor(value=energy_grad)
    return energy, forces
def _compute_and_apply_gradients(self, loss):
    """Compute the gradients for all variables and apply them to the variables.

    We alter the internal self._custom_getter_variable_cache with new
    "variables" for which a gradient descent step has been applied.

    Args:
        loss: The loss tensor we want to derive the gradients for.

    Raises:
        ValueError: In case we try to compute the gradients without ever
            having populated our custom_getter scope.
    """
    if not self._custom_getter_variable_cache:
        # The trailing space matters: the two literals are concatenated.
        raise ValueError(
            'Our custom getter has to be invoked at least once before '
            'we can compute gradients.')

    # We keep track of the previously used variables.
    self._variable_cache.append(self._custom_getter_variable_cache)
    # The old cache contains the latest variable state.
    variable_cache_old = self._variable_cache[-1]
    # The new cache will contain the updated variables.
    self._custom_getter_variable_cache = {}

    variable_list = list(variable_cache_old.keys())
    gradients = tf.gradients(
        [loss], [variable_cache_old[name] for name in variable_list])

    for name, gradient in zip(variable_list, gradients):
        # In case we change the model in an iteration.
        ignore_var = (self._var_scope is not None
                      and not name.startswith(self._var_scope))
        if gradient is None or ignore_var:
            # Carry the variable over unchanged.
            self._custom_getter_variable_cache[name] = variable_cache_old[name]
            continue

        if self._learn_inner_lr:
            learning_rate = self._get_learning_rate(name)
        else:
            learning_rate = self._learning_rate

        scaled_gradient = learning_rate * gradient
        if not self._use_second_order:
            # First-order mode: do not backpropagate through the inner step.
            scaled_gradient = tf.stop_gradient(scaled_gradient)

        self._custom_getter_variable_cache[name] = (
            variable_cache_old[name] - scaled_gradient)
def testBatchNormIsTraining(self, is_training):
    """Check gradients w.r.t. the batch-norm moving statistics.

    Exactly two variables with ``moving_`` in their name are expected. In
    training mode both gradients must be ``None`` and the UPDATE_OPS
    collection must be non-empty; otherwise no gradient may be ``None``.
    """
    feature_dims = (128, 128)
    inputs = tf.random.uniform((3, 4), dtype=tf.float64, seed=1)
    head = projection_head_lib.ProjectionHead(feature_dims=feature_dims,
                                              use_batch_norm=True)
    outputs = head(inputs, training=is_training)

    moving_stats = [v for v in tf.all_variables() if 'moving_' in v.name]
    self.assertLen(moving_stats, 2)

    grads = tf.gradients(outputs, moving_stats)
    self.assertLen(grads, 2)

    if not is_training:
        self.assertNotIn(None, grads)
        return
    self.assertAllEqual([None, None], grads)
    self.assertTrue(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
def _loss_gradient(self, x: "Tensor", y: "Tensor",
                   mask: "Tensor") -> "Tensor":
    """Build the op computing the loss gradient w.r.t. a padded input batch.

    The decoder forward pass is built only the first time (while
    ``self._metrics`` is ``None``); the loss is then read from the
    ``"per_loss"`` graph collection.

    Returns:
        Gradient of that loss with respect to ``x``.
    """
    import tensorflow.compat.v1 as tf1

    # Decoder inputs are assembled on every call.
    decoder_inputs = self._create_decoder_input(x, y, mask)

    # Forward pass, cached on the instance after the first call.
    if self._metrics is None:
        with self._cluster:
            with tf1.device(self._cluster.GetPlacer()):
                self._metrics = self._task.FPropDefaultTheta(decoder_inputs)

    per_example_loss = tf1.get_collection("per_loss")[0]
    gradients = tf1.gradients(per_example_loss, [x])
    return gradients[0]
def get_dense_grad_w(self, loss: tf.Tensor) -> tf.Tensor:
    """
    Return the gradient of `loss` with respect to this layer's dense dummy
    variable. The dense gradient is conditionally computed so may be stale.

    Raises ValueError when `loss` does not depend on the dummy variable.
    """
    dummy_var = self.get_dense_dummy_var()
    logger.debug(f"Layer '{self.name}' grad dummy var: '{dummy_var}'")

    grad_list = tf.gradients(loss, dummy_var)
    dense_grad = grad_list[0]
    if dense_grad is not None:
        return dense_grad

    raise ValueError(
        f"This sparse layer '{self.name}' is being asked to return a dense gradient "
        "but the loss op does not depend on it. Make sure the loss op is dependent "
        "on the output of this layer.")
def build_backward(self, dLdA):
    """Back-propagate ``dLdA`` through this layer's activation.

    Args:
        dLdA (Tensor): Gradient of the loss with respect to this layer's
            activation ``self.A``; used as ``grad_ys`` in ``tf.gradients``.

    Returns:
        The result of ``tf.gradients(self.A, self.Z, dLdA)``, i.e. the
        gradient of the loss with respect to the pre-activation ``self.Z``.
        Note that ``tf.gradients`` returns a list.
    """
    dLdZ = tf.gradients(self.A, self.Z, dLdA)
    return dLdZ
def __init__(self, sess, state_dim, action_dim, action_bound, learning_rate,
             tau, batch_size):
    """Build the actor network, its target copy and the training ops.

    The online network's parameters are the trainable variables existing
    right after it is built; the target network's parameters are the
    trainable variables created after those.
    """
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.action_bound = action_bound
    self.learning_rate = learning_rate
    self.tau = tau
    self.batch_size = batch_size

    # Online actor network.
    self.inputs, self.out, self.scaled_out = self.create_actor_network()
    self.network_params = tf.trainable_variables()

    # Target network; its variables follow the online ones.
    (self.target_inputs, self.target_out,
     self.target_scaled_out) = self.create_actor_network()
    n_online = len(self.network_params)
    self.target_network_params = tf.trainable_variables()[n_online:]

    # Soft update: target <- tau * online + (1 - tau) * target.
    self.update_target_network_params = [
        target_param.assign(
            tf.multiply(self.network_params[i], self.tau) +
            tf.multiply(target_param, 1. - self.tau))
        for i, target_param in enumerate(self.target_network_params)
    ]

    # This gradient will be provided by the critic network.
    self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])

    # Chain the (negated) critic gradient through the actor output, then
    # divide each parameter gradient by the batch size.
    self.unnormalized_actor_gradients = tf.gradients(
        self.scaled_out, self.network_params, -self.action_gradient)
    self.actor_gradients = [
        tf.div(grad, self.batch_size)
        for grad in self.unnormalized_actor_gradients
    ]

    # Optimization op.
    adam = tf.train.AdamOptimizer(self.learning_rate)
    self.optimize = adam.apply_gradients(
        zip(self.actor_gradients, self.network_params))

    self.num_trainable_vars = (len(self.network_params) +
                               len(self.target_network_params))
def __init__(self, loss, weights, sess=None):
    """
    Create a gradient aggregator.

    See 'Optimizer' class for a description of the arguments. When `sess`
    is None the default TensorFlow session is used.
    """
    self._loss = loss
    self._weights = weights
    self._sess = tf.get_default_session() if sess is None else sess

    dtype = self._loss.dtype
    shapes = [w.shape.as_list() for w in self._weights]

    def zero_variables():
        # One zero-initialised variable per weight, with matching shape.
        return [tf.Variable(tf.zeros(shape=s, dtype=dtype)) for s in shapes]

    self._grads = tf.gradients(loss, weights)

    # Reference gradient and the weights at which it was evaluated.
    self._ws_ref = zero_variables()
    self._grads_ref = zero_variables()

    # Backup variables used to swap weights when the gradient has to be
    # evaluated at different weights.
    self._ws_temp = zero_variables()

    # Storage for the actual (aggregated) gradient.
    self._grads_aggregated = zero_variables()

    # Number of samples used to compute the reference gradient.
    self._num_ref_samples = tf.Variable(tf.zeros(shape=[], dtype=tf.int32))
    self._num_ref_samples_batch = tf.placeholder(shape=[], dtype=tf.int32)

    self._update_ref_ops = self._update_ref()
    self._reset_ref_ops = self._reset_ref()
    self._ref_add_batch_ops = self._ref_add_batch()
    self._finalize_ref_ops = self._finalize_ref()
def setUp(self):
    """Build a sin(x) graph, a spied session and the expected BlurIG value."""
    super().setUp()
    self.max_sigma = 10
    with tf.Graph().as_default() as graph:
        self.x = tf.placeholder(shape=[None, 5, 5, 1], dtype=tf.float32)
        y = tf.sin(self.x)
        y_sum = tf.reduce_sum(y, [1, 2, 3])
        self.gradients_node = tf.gradients(y, self.x)[0]
        self.sess = tf.Session(graph=graph)
        self.sess_spy = mock.MagicMock(wraps=self.sess)

        # All black except 2 pixels near the center.
        pixels = np.zeros((5, 5), dtype=float)
        pixels[1, 1] = 0.5
        pixels[2, 2] = 1.0
        self.x_input_val = pixels.reshape((5, 5, 1))

        # Value of `y` at the input.
        y_input_val = self.sess.run(y,
                                    feed_dict={self.x: [self.x_input_val]})

        # Baseline is the fully blurred version of the input.
        blur_sigma = [self.max_sigma, self.max_sigma, 0]
        x_baseline_val = gaussian_filter(self.x_input_val,
                                         sigma=blur_sigma,
                                         mode='constant')
        y_baseline_val = self.sess.run(y,
                                       feed_dict={self.x: [x_baseline_val]})

        # The expected BlurIG value is the difference between `y` at the
        # input and `y` at the baseline. Because each value is independent,
        # the expected value can be computed element-wise.
        #
        # Expected: [[-0, -0, -0, -0, -0],
        #            [-0, 0.641, -0, -0, -0],
        #            [-0, -0, 0.838, -0, -0],
        #            [-0, -0, -0, -0, -0],
        #            [-0, -0, -0, -0, -0]]
        self.expected_val = y_input_val[0] - y_baseline_val[0]
        self.blur_ig_instance = blur_ig.BlurIG(graph, self.sess_spy, y_sum,
                                               self.x)
def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma,
             num_actor_vars):
    """Build the critic network, its target copy and the training ops.

    ``num_actor_vars`` is the number of trainable variables that precede the
    critic's own variables and is used to slice them out of
    ``tf.trainable_variables()``.
    """
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.learning_rate = learning_rate
    self.tau = tau
    self.gamma = gamma

    # Online critic network.
    self.inputs, self.action, self.out = self.create_critic_network()
    self.network_params = tf.trainable_variables()[num_actor_vars:]

    # Target network; its variables follow the actor and online critic ones.
    (self.target_inputs, self.target_action,
     self.target_out) = self.create_critic_network()
    target_offset = len(self.network_params) + num_actor_vars
    self.target_network_params = tf.trainable_variables()[target_offset:]

    # Soft update: target <- tau * online + (1 - tau) * target.
    self.update_target_network_params = [
        target_param.assign(
            tf.multiply(self.network_params[i], self.tau) +
            tf.multiply(target_param, 1. - self.tau))
        for i, target_param in enumerate(self.target_network_params)
    ]

    # Network target (y_i).
    self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

    # Loss and optimization op; the optimizer runs after any UPDATE_OPS.
    self.loss = tf.losses.mean_squared_error(self.predicted_q_value,
                                             self.out)
    self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(self.update_ops):
        adam = tf.train.AdamOptimizer(self.learning_rate)
        self.optimize = adam.minimize(self.loss)

    # Gradient of the net w.r.t. the action. For each action in the
    # minibatch this sums the gradients of every critic output w.r.t. that
    # action; each output is independent of all actions except one.
    self.action_grads = tf.gradients(self.out, self.action)
def test_attention(self):
    """Run the attention-layer input gradient ``FLAGS.iter`` times on GPU 0."""
    with self.session() as sess, tf.device('/GPU:0'):
        is_fp32 = FLAGS.precision == "fp32"
        self.precision = tf.float32 if is_fp32 else tf.float16

        # batch and seq size that fit into a single GPU collected from
        # https://github.com/ROCmSoftwarePlatform/BERT#out-of-memory-issues
        batch_size = FLAGS.batch
        seq_length = FLAGS.seq_length

        # number of heads for BERT base model collected from
        # https://github.com/ROCmSoftwarePlatform/BERT#pre-trained-models
        attention_head_size = FLAGS.attention_head_size
        num_attention_heads = FLAGS.num_attention_heads

        # default dropout prob in BERT model collected from
        # https://github.com/ROCmSoftwarePlatform/BERT/blob/bee6030e31e42a9394ac567da170a89a98d2062f/modeling.py#L42
        attention_probs_dropout_prob = 0.1

        # initialize layer and weight for dense layers input
        initializer_range = 0.2
        input_shape = [
            batch_size * seq_length,
            attention_head_size * num_attention_heads
        ]
        layer_input = init_rand_variable(input_shape, self.precision)
        attention_mask = init_ones([batch_size, seq_length, seq_length])

        attention_head_gpu = attention_layer(
            from_tensor=layer_input,
            to_tensor=layer_input,
            precision=self.precision,
            attention_mask=attention_mask,
            num_attention_heads=num_attention_heads,
            size_per_head=attention_head_size,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            initializer_range=initializer_range,
            do_return_2d_tensor=True,
            batch_size=batch_size,
            from_seq_length=seq_length,
            to_seq_length=seq_length)
        attention_head_gpu_gradient = tf.gradients(ys=attention_head_gpu,
                                                   xs=layer_input)

        init_op = tf.group(tf.compat.v1.global_variables_initializer(),
                           tf.compat.v1.local_variables_initializer())
        sess.run(init_op)

        remaining = FLAGS.iter
        for _ in range(remaining):
            sess.run(attention_head_gpu_gradient)
def build_graph(self):
    """Build the loss, regularised gradients, accuracy metric and train op.

    For every trainable variable with ``'dense'`` in its name except the
    last one, a regularisation term built from a one-step power-iteration
    estimate is added to the variable's gradient before the optimizer
    applies the gradients.
    """
    self.build_datapipeline()
    xb, yb = self.dataset_iterator.get_next()
    logits = self.model(xb)
    self.loss = self.loss_func(yb, logits)

    # (variable, index into tf.trainable_variables()) for dense layers.
    dense_vars = [(var, pos)
                  for pos, var in enumerate(tf.trainable_variables())
                  if 'dense' in var.name]
    # dont apply to last dense layer
    dense_vars.pop(-1)
    self.variables = dense_vars

    # One random column vector per regularised variable.
    self.vs = [
        tf.random.normal((var.shape.as_list()[-1], 1), mean=0., stddev=1.)
        for var, _ in self.variables
    ]
    assert len(self.variables) > 0

    # spectral norm reg
    grads = tf.gradients(self.loss, tf.trainable_variables())
    updated_vs = []
    for (var, idx), v_prev in zip(self.variables, self.vs):
        grad = grads[idx]
        grad_shape = grad.shape
        out_dim = var.shape[-1]

        W_grad = tf.reshape(grad, [-1, out_dim])
        W = tf.reshape(var, [-1, out_dim])

        u = W @ v_prev
        v_next = tf.transpose(W) @ u
        sigma = tf.norm(u, 2) / tf.norm(v_next, 2)
        reg_value = sigma * (u @ tf.transpose(v_next))

        W_grad = W_grad + self.reg_constant * reg_value
        grads[idx] = tf.reshape(W_grad, grad_shape)
        updated_vs.append(v_next)
    self.vs = updated_vs

    true_class = tf.argmax(yb, 1)
    predicted_class = tf.argmax(logits, 1)
    self.acc, self.acc_op = tf.metrics.accuracy(true_class,
                                                predicted_class,
                                                name='acc')
    self.acc_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                      scope="acc")
    self.acc_initializer = tf.variables_initializer(var_list=self.acc_vars)

    grads_and_vars = zip(grads, tf.trainable_variables())
    self.train_op = self.optimizer.apply_gradients(grads_and_vars)
def test_lnorm(self):
    """Run the layer-norm input gradient ``FLAGS.iter`` times on GPU 0."""
    with self.session() as sess, tf.device('/GPU:0'):
        hidden_size = FLAGS.hidden_size

        # initialize x_trf
        x_trf = init_weights([hidden_size, hidden_size])

        context_layer_gpu = layer_norm(x_trf)
        context_layer_gpu_gradient = tf.gradients(ys=context_layer_gpu,
                                                  xs=x_trf)

        init_op = tf.group(tf.compat.v1.global_variables_initializer(),
                           tf.compat.v1.local_variables_initializer())
        sess.run(init_op)

        remaining = FLAGS.iter
        for _ in range(remaining):
            sess.run(context_layer_gpu_gradient)
def train_step(x, y):
    """Build one training step for ``model`` (taken from the outer scope).

    Returns:
        Tuple ``(loss, optimizer.iterations, update op, y, y_hat)``.
    """
    y_hat = model(x, training=True)
    loss = loss_fn(y, y_hat)

    all_vars = list(model.trainable_variables)
    grads = tf.gradients(loss, all_vars)
    update = optimizer.apply_gradients(zip(grads, all_vars))

    # Disabled attack variants kept for reference:
    #   gradient ascent: grads = tf.gradients(tf.negative(loss), all_vars)
    #   sign flipping:   replace each gradient g with tf.reverse(g, [0])
    #                    before calling optimizer.apply_gradients
    return loss, optimizer.iterations, update, y, y_hat
def testTwoOps(self):
    """Tests that the op can be instantiated twice with appropriate results.

    Implementations with inappropriate global registration of gradients will
    fail this test.
    """
    scale = 0.1
    x_value = 3.0

    x = tf.placeholder(tf.float32, [1])
    y = x * x
    # Apply the gradient-scaling op twice in a row.
    for _ in range(2):
        y = snt.scale_gradient(y, scale)

    dydx = tf.gradients([y], [x])[0]

    with self.test_session() as sess:
        dydx_, y_ = sess.run([dydx, y], feed_dict={x: [x_value]})
        # Gradient is scaled twice; the forward value is untouched.
        self.assertAlmostEqual(dydx_[0], 2 * scale**2 * x_value, places=6)
        self.assertAlmostEqual(y_[0], x_value**2, places=6)
def build(self, num_inputs, num_outputs, num_targets):
    """Build a fresh graph and session holding one perceptron per group.

    For every entry of ``num_inputs`` an input placeholder is created, and
    for every activation in ``self.activations`` an assignable perceptron,
    its loss and the gradients of the (optionally regularised) loss w.r.t.
    its weights are added. All perceptron weights are initialised at the
    end.
    """
    # build a new graph and session in which output neuron training will
    # take place
    self.graph = tf.Graph()
    self.sess = tf.Session(graph=self.graph)

    with self.graph.as_default():
        # variables common to all neurons
        self.targets = tf.placeholder(dtype=dtype,
                                      shape=[None, num_targets],
                                      name='targets')
        self.inputs = []
        self.neurons = []
        self.losses = []
        self.train_ops = []
        self.place_and_assign = []
        all_weights = []

        # group neurons based on their inputs
        for input_width in num_inputs:
            input_ph = tf.placeholder(dtype=dtype,
                                      shape=[None, input_width],
                                      name='inputs')
            self.inputs.append(input_ph)

            # group neurons based on their activations
            for activation in self.activations:
                # prediction of the output neuron
                (y_pred, weights, placeholders,
                 assign_ops) = self._build_assignable_perceptron(
                     input_ph, num_outputs, activation, self.weight_init)
                self.neurons.append((weights, activation))
                self.place_and_assign.append((placeholders, assign_ops))

                # plain loss is recorded; the regularised one is trained on
                loss = self.loss_function(self.targets, y_pred)
                self.losses.append(loss)
                reg_loss = loss
                if self.regularizer is not None:
                    penalty = self.reg_penalty * self.regularizer(weights[0])
                    reg_loss = loss + penalty

                # gradients used for training
                weight_grads = tf.gradients(reg_loss, weights)
                self.train_ops.append((reg_loss, weight_grads))
                all_weights.extend(weights)

        # initialize variables
        self.sess.run(tf.variables_initializer(all_weights))
def _compute_gradients(self, loss, var_list=None):
    """Compute gradients using RTRL.

    The default compute_gradients method of the wrapped optimizer is called
    on the register's default variable list. The register then produces
    customized (gradient, variable) pairs from dL/dy, where y is the RNN's
    last scan output, and these are appended to the default ones. The new
    gradient buffer returned by the register is plugged into the RNN's
    grad_buffer_slot.

    When ``th.test_grad`` is set, the difference between each gradient
    computed here and the one computed by the optimizer directly is plugged
    into the RNN's grad_delta_slot.

    :param loss: the loss tensor
    :param var_list: variable list (may not be used for now)
    :return: A list of (gradient, variable) pairs as the compute_gradients
             method does in tf.Optimizer
    """
    # Sanity check
    assert isinstance(loss, tf.Tensor)
    assert isinstance(self._register, NodeRegister)

    # Compute gradients using default method
    default_grads_and_vars = self._tf_optimizer.compute_gradients(
        loss, var_list=self._register.default_var_list)

    # Compute gradients using customized method held by the register
    dL_dy = tf.gradients(loss, self._rnn.last_scan_output)[0]
    c_g_n_v, new_buffer = self._register.compute_customized_gradient(dL_dy)
    self._rnn.grad_buffer_slot.plug(new_buffer)

    grads_and_vars = default_grads_and_vars + c_g_n_v

    if th.test_grad:
        reference_grads_and_vars = self._tf_optimizer.compute_gradients(loss)
        deltas = []
        for ref_grad, ref_var in reference_grads_and_vars:
            # Exactly one gradient must have been produced for each variable.
            matches = [g for g, v in grads_and_vars if v is ref_var]
            assert len(matches) == 1
            delta_name = '_'.join(ref_var.name.split('/'))
            # The last two characters of the variable name are dropped.
            delta = tf.subtract(matches[0],
                                ref_grad,
                                name='delta_{}'.format(delta_name[:-2]))
            deltas.append(delta)
        self._rnn.grad_delta_slot.plug(tuple(deltas))

    return grads_and_vars
def create_optimizer(loss,
                     learning_rate,
                     num_train_steps,
                     weight_decay_rate=0.0,
                     use_tpu=False,
                     warmup_steps=0,
                     warmup_proportion=0,
                     lr_decay_power=1.0,
                     layerwise_lr_decay_power=-1,
                     n_transformer_layers=None):
    """Creates an optimizer and training op.

    The learning rate decays polynomially to 0 over ``num_train_steps`` and
    is scaled linearly from 0 during warm-up. Gradients are clipped to a
    global norm of 1.0 before being applied.

    Args:
        loss: Loss tensor to minimise.
        learning_rate: Initial learning rate.
        num_train_steps: Number of steps over which the rate decays.
        weight_decay_rate: Weight decay passed to the optimizer.
        use_tpu: Wrap the optimizer in ``tf.tpu.CrossShardOptimizer``.
        warmup_steps: Minimum number of warm-up steps.
        warmup_proportion: Warm-up length as a fraction of
            ``num_train_steps``; the larger of the two settings is used.
        lr_decay_power: Power of the polynomial decay.
        layerwise_lr_decay_power: When positive, per-layer learning rates
            are derived via ``_get_layer_lrs``.
        n_transformer_layers: Passed to ``_get_layer_lrs``.

    Returns:
        The training op, which also increments the global step.
    """
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=lr_decay_power,
                                              cycle=False)

    warmup_steps = max(num_train_steps * warmup_proportion, warmup_steps)
    # With no warm-up configured the scaling factor would be 0 / 0 at step 0,
    # turning the learning rate into NaN, so it is skipped.
    if warmup_steps > 0:
        learning_rate *= tf.minimum(
            1.0,
            tf.cast(global_step, tf.float32) /
            tf.cast(warmup_steps, tf.float32))

    if layerwise_lr_decay_power > 0:
        learning_rate = _get_layer_lrs(learning_rate,
                                       layerwise_lr_decay_power,
                                       n_transformer_layers)

    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=weight_decay_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    if use_tpu:
        optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)

    # NOTE(review): the step is advanced manually here; presumably
    # AdamWeightDecayOptimizer.apply_gradients does not do it — confirm.
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
def test1DAreaMax(self):
    """Check 1-D max-mode area attention gives finite outputs and gradients."""
    batch_size = 256
    feature_len = 100
    heads = 2
    key_len = 15
    depth = 128
    max_area_width = 3

    query_shape = [batch_size, heads, key_len, depth]
    memory_shape = [batch_size, heads, feature_len, depth]
    queries = tf.random_uniform(query_shape, minval=-10.0, maxval=10.0)
    features = tf.random_uniform(memory_shape, minval=-10.0, maxval=10.0)

    # Random lengths for all but the last example, which uses full length.
    random_lengths = np.random.randint(max_area_width, feature_len,
                                       [batch_size - 1])
    all_lengths = np.concatenate(
        (random_lengths, np.array([feature_len])), axis=0)
    feature_length = tf.constant(all_lengths, tf.int32)

    # Zero out features beyond each example's length.
    base_mask = tf.expand_dims(tf.sequence_mask(feature_length), 1)
    mask = tf.tile(tf.expand_dims(base_mask, 3), [1, heads, 1, depth])
    features = tf.where(mask, features, tf.zeros_like(features))

    # [batch, 1, 1, memory_length]
    bias_mask = tf.expand_dims(base_mask, 1)
    bias = tf.where(bias_mask, tf.zeros_like(bias_mask, tf.float32),
                    tf.ones_like(bias_mask, tf.float32) * -1e9)

    target_values = tf.random_uniform(query_shape, minval=-0.2, maxval=0.2)
    keys = tf.layers.dense(features, units=depth)
    values = tf.layers.dense(features, units=depth)

    max_attention = area_attention.dot_product_area_attention(
        queries,
        keys,
        values,
        bias=bias,
        area_key_mode="max",
        area_value_mode="max",
        name="max_key",
        max_area_width=max_area_width)

    squared_error = tf.pow(target_values - max_attention, 2)
    max_gradients = tf.gradients(tf.reduce_mean(squared_error), features)

    with self.test_session() as session:
        session.run(tf.global_variables_initializer())
        grad_values, attention_values = session.run(
            [max_gradients, max_attention])

    # Neither the gradients nor the outputs may contain NaN or inf.
    self.assertTrue(np.all(np.isfinite(grad_values)))
    self.assertTrue(np.all(np.isfinite(attention_values)))
def testTrainingFalse(self, ctor, depth):
    """Check output shape, dtype, input gradient and absence of NaNs.

    The network is built with ``training=False``. The expected channel
    count is 2048 for ``depth >= 50`` and 512 otherwise.
    """
    batch_size = 2
    out_channels = 2048 if depth >= 50 else 512
    expected_output_shape = [batch_size, out_channels]

    inputs = tf.random.uniform(dtype=tf.float32,
                               shape=[batch_size, 224, 224, 3],
                               seed=1)
    resnet = ctor(depth)
    outputs = resnet(inputs, training=False)
    gradient = tf.gradients(outputs, inputs)

    self.assertListEqual(expected_output_shape, outputs.shape.as_list())
    self.assertEqual(inputs.dtype, outputs.dtype)
    # tf.gradients always returns a list, so the list itself is never None;
    # the entry for `inputs` is what shows whether a gradient exists.
    self.assertLen(gradient, 1)
    self.assertIsNotNone(gradient[0])

    with self.cached_session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        outputs = sess.run(outputs)
        # Make sure that there are no NaNs
        self.assertFalse(np.isnan(outputs).any())
def __init__(self, sess, state_dim, action_dim, learning_rate):
    """Build the actor network, parameter-setting ops and the training op.

    The objective combines the log-probability of the selected action,
    weighted by the negated ``act_grad_weights``, with an entropy term
    scaled by ``ENTROPY_WEIGHT``.
    """
    self.sess = sess
    self.s_dim = state_dim
    self.a_dim = action_dim
    self.lr_rate = learning_rate

    # Create the actor network
    self.inputs, self.out = self.create_actor_network()

    # All trainable variables under the 'actor' scope
    self.network_params = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')

    # One placeholder per parameter, plus ops assigning them
    self.input_network_params = [
        tf.placeholder(tf.float32, shape=param.get_shape())
        for param in self.network_params
    ]
    self.set_network_params_op = [
        self.network_params[idx].assign(new_value)
        for idx, new_value in enumerate(self.input_network_params)
    ]

    # Selected action, 0-1 vector
    self.acts = tf.placeholder(tf.float32, [None, self.a_dim])

    # This gradient will be provided by the critic network
    self.act_grad_weights = tf.placeholder(tf.float32, [None, 1])

    # Objective: weighted log-probability of the taken action plus entropy
    selected_prob = tf.reduce_sum(tf.multiply(self.out, self.acts),
                                  reduction_indices=1,
                                  keep_dims=True)
    policy_term = tf.reduce_sum(
        tf.multiply(tf.log(selected_prob), -self.act_grad_weights))
    entropy_term = ENTROPY_WEIGHT * tf.reduce_sum(
        tf.multiply(self.out, tf.log(self.out + ENTROPY_EPS)))
    self.obj = policy_term + entropy_term

    # Gradients of the objective w.r.t. the actor parameters
    self.actor_gradients = tf.gradients(self.obj, self.network_params)

    # Optimization Op
    rmsprop = tf.train.RMSPropOptimizer(self.lr_rate)
    self.optimize = rmsprop.apply_gradients(
        zip(self.actor_gradients, self.network_params))
def jacobian_graph(predictions, x, nb_classes):
    """
    Create the Jacobian graph to be ran later in a TF session
    :param predictions: the model's symbolic output (linear output,
                        pre-softmax)
    :param x: the input placeholder
    :param nb_classes: the number of classes the model has
    :return: a list with one gradient tensor per class, each the gradient
             of that class's prediction column with respect to x
    """
    # This function will return a list of TF gradients
    list_derivatives = []

    # Define the TF graph elements to compute our derivatives for each class.
    # `range` is used instead of `xrange` so the function also runs on
    # Python 3 without a compatibility import.
    for class_ind in range(nb_classes):
        derivatives, = tf.gradients(predictions[:, class_ind], x)
        list_derivatives.append(derivatives)

    return list_derivatives
def compute_gradient(self, objective, argument):
    """
    Compute the gradient of 'objective' and return as a function.

    When 'argument' is a list, the returned function expects one value per
    entry and returns the list of gradients; otherwise it expects a single
    value and returns a single gradient.
    """
    tfgrad = tf.gradients(objective, argument)

    if isinstance(argument, list):

        def grad(x):
            # Feed every argument tensor with its matching value.
            return self._session.run(tfgrad, dict(zip(argument, x)))
    else:

        def grad(x):
            return self._session.run(tfgrad[0], {argument: x})

    return grad
def __init__(self, sess, state_dim, action_dim, learning_rate):
    """Build the critic network and its clipped-gradient Adam update op."""
    self.sess = sess
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.learning_rate = learning_rate

    # Create the critic network
    self.model, self.phi, self.states = build_network(self.state_dim)

    # Placeholder holding the temporal-difference targets
    self.td_targets = tf.placeholder(tf.float32, [None, 1])

    # Global loss function and its gradients
    v_values = self.model.output
    td_error = self.td_targets - v_values
    loss = tf.reduce_sum(tf.square(td_error))
    dj_dphi = tf.gradients(loss, self.phi)

    # Gradient clipping
    clipped_dj_dphi, _ = tf.clip_by_global_norm(dj_dphi, 40)

    # Update the global network with the gradients
    adam = tf.train.AdamOptimizer(self.learning_rate)
    self.critic_optimizer = adam.apply_gradients(
        zip(clipped_dj_dphi, self.phi))