def test_loss(self, dist_norm, top_k, worst_case_loss):
        image_shape = (12, 12, 1)
        num_classes = 10
        batch_size = 3
        images = tf.convert_to_tensor(np.random.rand(*((batch_size, ) +
                                                       image_shape)),
                                      dtype=tf.float32)
        labels = tf.convert_to_tensor(np.random.randint(0,
                                                        high=num_classes,
                                                        size=batch_size),
                                      dtype=tf.int32)
        # Toy model.
        endpoints = {}
        endpoints["input_layer"] = images
        # Convolution layer.
        net = tf.keras.layers.Conv2D(filters=8,
                                     kernel_size=3,
                                     strides=(1, 1),
                                     padding="same",
                                     activation=tf.nn.relu)(images)
        endpoints["conv_layer"] = net
        # Global average pooling layer.
        net = tf.reduce_mean(net, axis=[1, 2])
        # Output layer.
        logits = tf.keras.layers.Dense(num_classes)(net)
        loss = margin_loss.large_margin(
            logits=logits,
            one_hot_labels=tf.one_hot(labels, num_classes),
            layers_list=[endpoints["input_layer"], endpoints["conv_layer"]],
            gamma=10000,
            alpha_factor=4,
            top_k=top_k,
            dist_norm=dist_norm,
            worst_case_loss=worst_case_loss)
        var_list = tf.global_variables()
        init = tf.global_variables_initializer()

        # Test gradients are not None.
        gs = tf.gradients(loss, var_list)
        for g in gs:
            self.assertIsNotNone(g)

        # Test loss shape.
        with self.test_session() as sess:
            sess.run(init)
            self.assertEqual(sess.run(loss).shape, ())
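A quick aside on what the gradient assertions above catch: tf.gradients returns None for any variable the loss does not depend on. A minimal sketch of that behavior (my illustration, not from the source; TF1-style API assumed):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

a = tf.Variable(1.0)
b = tf.Variable(2.0)
loss = a * a  # loss is independent of b
grads = tf.gradients(loss, [a, b])
print(grads)  # [<Tensor ...>, None] -- the None flags the disconnected variable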
Example #2
    def add_optimizer_op(self, loss, lr_input):

        if self.optimizer == "gd":
            optimizer = tf.train.GradientDescentOptimizer(lr_input)
        elif self.optimizer == "adadelta":
            optimizer = tf.train.AdadeltaOptimizer(lr_input)
        elif self.optimizer == "adagrad":
            optimizer = tf.train.AdagradOptimizer(lr_input)
        elif self.optimizer == "adam":
            optimizer = tf.train.AdamOptimizer(lr_input,
                                               beta1=self.beta1,
                                               beta2=self.beta2,
                                               epsilon=self.epsilon)
        elif self.optimizer == "momentum":
            optimizer = tf.train.MomentumOptimizer(lr_input, self.momentum)
        elif self.optimizer == "rmsprop":
            optimizer = tf.train.RMSPropOptimizer(lr_input,
                                                  momentum=self.momentum)
        else:
            print(
                "Optimizer arg should be one of [gd, adadelta, adagrad, adam, momentum, rmsprop]."
            )
            return None

        if self.clipping_norm > 0 or self.save_weights:
            trainables = tf.trainable_variables()
            grads = tf.gradients(loss, trainables)

            if self.save_weights:
                for i in range(len(grads)):
                    util.add_summaries("",
                                       self.name,
                                       grads[i],
                                       header_name=grads[i].name + "/",
                                       save_stddev=True,
                                       save_mean=True)

        if self.clipping_norm > 0:
            clipped_grads, _ = tf.clip_by_global_norm(
                grads, clip_norm=self.clipping_norm)
            grad_var_pairs = zip(clipped_grads, trainables)
            training_optimizer = optimizer.apply_gradients(grad_var_pairs)
        else:
            training_optimizer = optimizer.minimize(loss)

        return training_optimizer
Example #3
  def compute_gradients(self, loss, var_list=None, **kwargs):
    if not var_list:
      var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

    if self._l1:
      l1 = tf.add_n(
          [tf.reduce_sum(tf.abs(p)) * self._reg_factor for p in var_list])
      loss = loss + l1 * self._l1
    if self._l2:
      l2 = tf.add_n(
          [tf.reduce_sum(tf.square(p)) * self._reg_factor for p in var_list])
      loss = loss + l2 * self._l2

    grads_and_vars = list(
        zip(tf.gradients(loss, var_list, colocate_gradients_with_ops=True),
            var_list))
    return grads_and_vars
Example #4
def dense_transformer_fwd_and_grad(transformer, input_activation):
    output_activation = transformer.encoder_layer(input_activation,
                                                  mask=None,
                                                  debug_name="layer_0")
    loss = tf.reduce_sum(output_activation)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
    grads = optimizer.compute_gradients(loss)
    input_grad = tf.gradients(loss, input_activation)[0]
    with tf.control_dependencies([input_grad]):
        train_op = optimizer.apply_gradients(grads)

    with tf.control_dependencies([train_op]):
        streamOps = {"output_activation": output_activation}
        streamOps["input_grad"] = input_grad
        for grad, var in grads:
            streamOps[var.op.name + "_grad"] = grad
        return streamOps
Example #5
 def create_training_method(self):
     # Define training optimizer
     L2 = 0.001
     self.v_input = tf.placeholder("float", [None, 1])
     weight_decay = tf.add_n([L2 * tf.nn.l2_loss(var) for var in self.net])
     self.loss = tf.reduce_mean(
         tf.square(self.v_input - self.q_value_output))
     self.cost = self.loss + weight_decay
     self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.cost)
     action_com_var_list = self.action_net + [self.net[11], self.net[12]]
     self.optimizer_action_com = tf.train.AdamOptimizer(self.lr).minimize(
         self.cost, var_list=action_com_var_list)
     state_com_var_list = self.state_net + self.combined_net
     self.optimizer_state_com = tf.train.AdamOptimizer(self.lr).minimize(
         self.cost, var_list=state_com_var_list)
     self.action_gradients = tf.gradients(self.q_value_output,
                                          self.action_input)
Example #6
            def loop_series_terms(k, output_grads, trace):
                """
                :param k: loop over k terms
                :param output_grads: (batch_size, height, width, num_channel)
                :param trace: (batch_size,)
                :return:
                """
                # shape (batch_size, height, width, num_channel)
                grads = tf.gradients(Gx, x, output_grads)[0]
                # shape (batch_size, 1, h*w*c)
                grads_reshaped = tf.reshape(grads, (u_shape[0], 1, -1))

                trace = trace + tf.squeeze(
                    tf.cond(tf.equal(k % 2, 0), lambda: 1.0, lambda: -1.0) *
                    tf.matmul(grads_reshaped, u_reshaped) / tf.cast(k + 1, tf.float32),
                    axis=[1, 2])
                return k + 1, grads, trace
Example #7
 def energy_and_forces_from_atomic_properties(self,
                                              Ea,
                                              Qa,
                                              Dij,
                                              Z,
                                              R,
                                              idx_i,
                                              idx_j,
                                              Q_tot=None,
                                              batch_seg=None):
     with tf.compat.v1.name_scope(
             "energy_and_forces_from_atomic_properties"):
         energy = self.energy_from_atomic_properties(
             Ea, Qa, Dij, Z, idx_i, idx_j, Q_tot, batch_seg)
         forces = -tf.convert_to_tensor(value=tf.gradients(
             ys=tf.reduce_sum(input_tensor=energy), xs=R)[0])
     return energy, forces
Example #8
  def _compute_and_apply_gradients(self, loss):
    """Compute the gradients for all variables and apply them to the variables.

    We alter the internal self._custom_getter_variable_cache with new
    "variables" for which a gradient descent step has been applied.

    Args:
      loss: The loss tensor we want to derive the gradients for.

    Raises:
      ValueError: In case we try to compute the gradients without ever having
        populated our custom_getter scope.
    """

    if not self._custom_getter_variable_cache:
      raise ValueError(
          'Our custom getter has to be invoked at least once before '
          'we can compute gradients.')

    # We keep track of the previously used variables.
    self._variable_cache.append(self._custom_getter_variable_cache)

    # The old cache contains the latest variable state.
    variable_cache_old = self._variable_cache[-1]
    # The new cache will contain the updated variables.
    self._custom_getter_variable_cache = {}

    variable_list = list(variable_cache_old.keys())
    gradients = tf.gradients(
        [loss], [variable_cache_old[name] for name in variable_list])
    for name, gradient in zip(variable_list, gradients):
      # In case we change the model in an iteration.
      ignore_var = (
          self._var_scope is not None and not name.startswith(self._var_scope))
      if (gradient is None or ignore_var):
        self._custom_getter_variable_cache[name] = variable_cache_old[name]
        continue
      if self._learn_inner_lr:
        learning_rate = self._get_learning_rate(name)
      else:
        learning_rate = self._learning_rate
      scaled_gradient = learning_rate * gradient
      if not self._use_second_order:
        scaled_gradient = tf.stop_gradient(scaled_gradient)
      self._custom_getter_variable_cache[name] = (
          variable_cache_old[name] - scaled_gradient)
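The tf.stop_gradient call above is the usual first-order trick: wrapping the scaled gradient keeps the inner update step out of any outer (second-order) gradient. A minimal sketch of the effect (my illustration, not the author's code):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

w = tf.Variable(2.0)
inner_loss = tf.square(w)
step = 0.1 * tf.gradients(inner_loss, w)[0]  # inner SGD step: 0.1 * 2w
w_fast = w - tf.stop_gradient(step)          # first-order: step is treated as a constant
outer_grad = tf.gradients(tf.square(w_fast), w)[0]  # equals 2 * w_fast, not the second-order value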
Example #9
 def testBatchNormIsTraining(self, is_training):
     feature_dims = (128, 128)
     inputs = tf.random.uniform((3, 4), dtype=tf.float64, seed=1)
     projection_head = projection_head_lib.ProjectionHead(
         feature_dims=feature_dims, use_batch_norm=True)
     outputs = projection_head(inputs, training=is_training)
     statistics_vars = [
         var for var in tf.all_variables() if 'moving_' in var.name
     ]
     self.assertLen(statistics_vars, 2)
     grads = tf.gradients(outputs, statistics_vars)
     self.assertLen(grads, 2)
     if is_training:
         self.assertAllEqual([None, None], grads)
         self.assertTrue(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
     else:
         self.assertNotIn(None, grads)
Example #10
    def _loss_gradient(self, x: "Tensor", y: "Tensor",
                       mask: "Tensor") -> "Tensor":
        """Define loss gradients computation operation for a batch of padded inputs."""
        import tensorflow.compat.v1 as tf1

        # create decoder inputs
        decoder_inputs = self._create_decoder_input(x, y, mask)

        # call decoder
        if self._metrics is None:
            with self._cluster, tf1.device(self._cluster.GetPlacer()):
                self._metrics = self._task.FPropDefaultTheta(decoder_inputs)

        # compute loss gradient
        loss = tf1.get_collection("per_loss")[0]
        loss_gradient = tf1.gradients(loss, [x])[0]
        return loss_gradient
Example #11
    def get_dense_grad_w(self, loss: tf.Tensor) -> tf.Tensor:
        """
        Access the TensorFlow variable that is holding the dense gradient for this layer.
        The dense gradient is conditionally computed so may be stale.
        """
        dummy_var = self.get_dense_dummy_var()

        logger.debug(f"Layer '{self.name}' grad dummy var: '{dummy_var}'")
        dense_grad = tf.gradients(loss, dummy_var)[0]

        if dense_grad is None:
            raise ValueError(
                f"This sparse layer '{self.name}' is being asked to return a dense gradient "
                "but the loss op does not depend on it. Make sure the loss op is dependent "
                "on the output of this layer.")

        return dense_grad
Example #12
 def build_backward(self, dLdA):
     """Connects the next layer to the current layer using the
         backward pass process.
         
     Args:
         dLdA (Tensor): An n by b tensor representing the gradient of
             the loss with respect to this layer's activation for a 
             batch of size b.
             
     Returns:
         Tensor: An m by b tensor, dLdZ, representing the gradient of 
             the loss with respect to the previous layer's 
             pre-activation for a batch of size b. (Note: n and m are 
             equal.)
             
     """
      return tf.gradients(self.A, self.Z, dLdA)[0]
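The third argument of tf.gradients is grad_ys, so the call above computes a vector-Jacobian product: it backpropagates the upstream gradient dLdA through the dependence of A on Z. A minimal sketch of the same pattern (my illustration, assuming the TF1 API):

import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Z = tf.placeholder(tf.float32, [3])
A = tf.sigmoid(Z)                       # activation: A = sigma(Z)
dLdA = tf.placeholder(tf.float32, [3])  # upstream gradient
dLdZ = tf.gradients(A, Z, dLdA)[0]      # dLdA * sigma'(Z), elementwise here

with tf.Session() as sess:
    print(sess.run(dLdZ, {Z: np.zeros(3), dLdA: np.ones(3)}))  # ~[0.25 0.25 0.25]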
Example #13
    def __init__(self, sess, state_dim, action_dim, action_bound,
                 learning_rate, tau, batch_size):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.learning_rate = learning_rate
        self.tau = tau
        self.batch_size = batch_size

        # Actor Network
        self.inputs, self.out, self.scaled_out = self.create_actor_network()

        self.network_params = tf.trainable_variables()

        # Target Network
        self.target_inputs, self.target_out, self.target_scaled_out = self.create_actor_network(
        )

        self.target_network_params = tf.trainable_variables(
        )[len(self.network_params):]

        # Op for periodically updating target network with online network
        # weights
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) +
                                                  tf.multiply(self.target_network_params[i], 1. - self.tau))
             for i in range(len(self.target_network_params))]

        # This gradient will be provided by the critic network
        self.action_gradient = tf.placeholder(tf.float32, [None, self.a_dim])

        # Combine the gradients here
        self.unnormalized_actor_gradients = tf.gradients(
            self.scaled_out, self.network_params, -self.action_gradient)
        self.actor_gradients = list(
            map(lambda x: tf.div(x, self.batch_size),
                self.unnormalized_actor_gradients))

        # Optimization Op
        self.optimize = tf.train.AdamOptimizer(self.learning_rate). \
            apply_gradients(zip(self.actor_gradients, self.network_params))

        self.num_trainable_vars = len(self.network_params) + len(
            self.target_network_params)
Example #14
    def __init__(self, loss, weights, sess=None):
        """
    Create a gradient aggregator.

    See 'Optimizer' class for a description of the arguments.
    """
        self._loss = loss
        self._weights = weights

        if sess is None:
            self._sess = tf.get_default_session()
        else:
            self._sess = sess

        dtype = self._loss.dtype
        shapes = [w.shape.as_list() for w in self._weights]

        self._grads = tf.gradients(loss, weights)

        # Create variables to store the reference gradient and the weights with
        # which the reference gradient was evaluated.
        self._ws_ref = [
            tf.Variable(tf.zeros(shape=s, dtype=dtype)) for s in shapes
        ]
        self._grads_ref = [tf.Variable(tf.zeros(shape=s, dtype=dtype)) \
                           for s in shapes]

        # Since we need to evaluate the gradient with different weights, we need
        # backup variables to swap weights.
        self._ws_temp = [tf.Variable(tf.zeros(shape=s, dtype=dtype)) \
                         for s in shapes]

        # Finally, we need variables to store the actual gradient.
        self._grads_aggregated = [tf.Variable(tf.zeros(shape=s, dtype=dtype)) \
                                  for s in shapes]

        # Variables to keep track of the number of samples used to compute the
        # reference gradient.
        self._num_ref_samples = tf.Variable(tf.zeros(shape=[], dtype=tf.int32))
        self._num_ref_samples_batch = tf.placeholder(shape=[], dtype=tf.int32)

        self._update_ref_ops = self._update_ref()
        self._reset_ref_ops = self._reset_ref()
        self._ref_add_batch_ops = self._ref_add_batch()
        self._finalize_ref_ops = self._finalize_ref()
Example #15
    def setUp(self):
        super().setUp()
        self.max_sigma = 10
        with tf.Graph().as_default() as graph:
            self.x = tf.placeholder(shape=[None, 5, 5, 1], dtype=tf.float32)
            y = tf.sin(self.x)
            y_sum = tf.reduce_sum(y, [1, 2, 3])
            self.gradients_node = tf.gradients(y, self.x)[0]
            self.sess = tf.Session(graph=graph)
            self.sess_spy = mock.MagicMock(wraps=self.sess)
            # All black except 2 pixels near the center.
            self.x_input_val = np.array([
                [0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.5, 0.0, 0.0, 0.0],
                [0.0, 0.0, 1.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0],
                [0.0, 0.0, 0.0, 0.0, 0.0],
            ],
                                        dtype=float)
            self.x_input_val = self.x_input_val.reshape((5, 5, 1))
            # Calculate the value of `y` at the input.
            y_input_val = self.sess.run(y,
                                        feed_dict={self.x: [self.x_input_val]})

            # Baseline is the fully blurred version of the input.
            x_baseline_val = gaussian_filter(
                self.x_input_val,
                sigma=[self.max_sigma, self.max_sigma, 0],
                mode='constant')
            y_baseline_val = self.sess.run(
                y, feed_dict={self.x: [x_baseline_val]})

            # The expected BlurIG value is equal to the difference between
            # the `y` value at the input and the `y` value at the baseline. Because
            # each value is independent, we can calculate the expected blur_ig value
            # of each.
            #
            # Expected: [[-0, -0, -0, -0, -0],
            #            [-0, 0.641, -0, -0, -0],
            #            [-0, -0, 0.838, -0, -0],
            #            [-0, -0, -0, -0, -0],
            #            [-0, -0, -0, -0, -0]]
            self.expected_val = y_input_val[0] - y_baseline_val[0]
            self.blur_ig_instance = blur_ig.BlurIG(graph, self.sess_spy, y_sum,
                                                   self.x)
Example #16
    def __init__(self, sess, state_dim, action_dim, learning_rate, tau, gamma,
                 num_actor_vars):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.learning_rate = learning_rate
        self.tau = tau
        self.gamma = gamma

        # Create the critic network
        self.inputs, self.action, self.out = self.create_critic_network()

        self.network_params = tf.trainable_variables()[num_actor_vars:]

        # Target Network
        self.target_inputs, self.target_action, self.target_out = self.create_critic_network(
        )

        self.target_network_params = tf.trainable_variables()[(
            len(self.network_params) + num_actor_vars):]

        # Op for periodically updating target network with online network
        # weights with regularization
        self.update_target_network_params = \
            [self.target_network_params[i].assign(tf.multiply(self.network_params[i], self.tau) \
            + tf.multiply(self.target_network_params[i], 1. - self.tau))
                for i in range(len(self.target_network_params))]

        # Network target (y_i)
        self.predicted_q_value = tf.placeholder(tf.float32, [None, 1])

        # Define loss and optimization Op
        self.loss = tf.losses.mean_squared_error(self.predicted_q_value,
                                                 self.out)
        self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(self.update_ops):
            self.optimize = tf.train.AdamOptimizer(
                self.learning_rate).minimize(self.loss)

        # Get the gradient of the net w.r.t. the action.
        # For each action in the minibatch (i.e., for each x in xs),
        # this will sum up the gradients of each critic output in the minibatch
        # w.r.t. that action. Each output is independent of all
        # actions except for one.
        self.action_grads = tf.gradients(self.out, self.action)
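The comment above leans on a property of tf.gradients worth spelling out: tf.gradients(ys, xs) differentiates the sum of ys, so the per-row result here is the per-example gradient only because each critic output depends on its own action alone. A hedged illustration (not from the source):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

a = tf.placeholder(tf.float32, [None, 2])
q = tf.reduce_sum(a * a, axis=1, keepdims=True)  # q_i depends only on row a_i
g = tf.gradients(q, a)[0]                        # row i is dq_i/da_i = 2 * a_i

with tf.Session() as sess:
    print(sess.run(g, {a: [[1., 2.], [3., 4.]]}))  # [[2. 4.] [6. 8.]]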
Example #17
  def test_attention(self):
    with self.session() as sess:
      with tf.device('/GPU:0'): 
        if FLAGS.precision == "fp32":
          self.precision=tf.float32
        else: 
          self.precision=tf.float16

        # batch and seq size that fit into a single GPU collected from https://github.com/ROCmSoftwarePlatform/BERT#out-of-memory-issues
        batch_size = FLAGS.batch
        seq_length = FLAGS.seq_length

        # number of heads for BERT base model collected from https://github.com/ROCmSoftwarePlatform/BERT#pre-trained-models
        attention_head_size = FLAGS.attention_head_size
        num_attention_heads = FLAGS.num_attention_heads

        # default dropout prob in BERT model collected from https://github.com/ROCmSoftwarePlatform/BERT/blob/bee6030e31e42a9394ac567da170a89a98d2062f/modeling.py#L42
        attention_probs_dropout_prob = 0.1

        # initialize layer and weight for dense layers input
        initializer_range = 0.2
        layer_input = init_rand_variable(
            [batch_size * seq_length, attention_head_size * num_attention_heads],
            self.precision)
        attention_mask = init_ones([batch_size, seq_length, seq_length])

        attention_head_gpu = attention_layer(
                from_tensor=layer_input,
                to_tensor=layer_input,
                precision=self.precision,
                attention_mask=attention_mask,
                num_attention_heads=num_attention_heads,
                size_per_head=attention_head_size,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                do_return_2d_tensor=True,
                batch_size=batch_size,
                from_seq_length=seq_length,
                to_seq_length=seq_length)
                
        attention_head_gpu_gradient = tf.gradients(ys=attention_head_gpu, xs=layer_input)
        
        init_op = tf.group(tf.compat.v1.global_variables_initializer(),
                          tf.compat.v1.local_variables_initializer())
        sess.run(init_op)
        for _ in range(FLAGS.iter):
            sess.run(attention_head_gpu_gradient)
Example #18
    def build_graph(self):
        self.build_datapipeline()

        xb, yb = self.dataset_iterator.get_next()
        logits = self.model(xb)
        self.loss = self.loss_func(yb, logits)

        self.variables = [(v, i)
                          for i, v in enumerate(tf.trainable_variables())
                          if 'dense' in v.name]
        # don't apply to the last dense layer
        self.variables.pop(-1)
        self.vs = [
            tf.random.normal((v.shape.as_list()[-1], 1), mean=0., stddev=1.)
            for v, _ in self.variables
        ]

        assert len(self.variables) > 0
        # spectral norm reg
        grads = tf.gradients(self.loss, tf.trainable_variables())
        new_vs = []
        for (var, idx), v in zip(self.variables, self.vs):
            original_shape = grads[idx].shape
            W_grad = tf.reshape(grads[idx], [-1, var.shape[-1]])
            W = tf.reshape(var, [-1, var.shape[-1]])

            u = W @ v
            v = tf.transpose(W) @ u
            sigma = tf.norm(u, 2) / tf.norm(v, 2)
            reg_value = sigma * (u @ tf.transpose(v))
            W_grad += self.reg_constant * reg_value

            grads[idx] = tf.reshape(W_grad, original_shape)
            new_vs.append(v)
        self.vs = new_vs

        self.acc, self.acc_op = tf.metrics.accuracy(tf.argmax(yb, 1),
                                                    tf.argmax(logits, 1),
                                                    name='acc')
        self.acc_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES,
                                          scope="acc")
        self.acc_initializer = tf.variables_initializer(var_list=self.acc_vars)

        self.train_op = self.optimizer.apply_gradients(
            zip(grads, tf.trainable_variables()))
Example #19
    def test_lnorm(self):
        with self.session() as sess:
            with tf.device('/GPU:0'):

                hidden_size = FLAGS.hidden_size

                #initialize x_trf
                x_trf = init_weights([hidden_size, hidden_size])

                context_layer_gpu = layer_norm(x_trf)
                context_layer_gpu_gradient = tf.gradients(ys=context_layer_gpu,
                                                          xs=x_trf)

                init_op = tf.group(tf.compat.v1.global_variables_initializer(),
                                   tf.compat.v1.local_variables_initializer())
                sess.run(init_op)
                for _ in range(FLAGS.iter):
                    sess.run(context_layer_gpu_gradient)
Example #20
    def train_step(x, y):
        y_hat = model(x, training=True)
        loss = loss_fn(y, y_hat)
        all_vars = []
        for v in model.trainable_variables:
            all_vars.append(v)
        grads = tf.gradients(loss, all_vars)
        #grads = tf.gradients(tf.negative(loss), all_vars)  #gradient ascent attack
        update = optimizer.apply_gradients(zip(grads, all_vars))

        #new_grads = grads      #sign-flipping attack
        #i=0
        #for tmp in grads:
        #    new_grads[i] = tf.reverse(tmp, [0])
        #    i+=1
        #update = optimizer.apply_gradients(zip(new_grads, all_vars))
        
        return loss, optimizer.iterations, update, y, y_hat
Example #21
    def testTwoOps(self):
        """Tests that the op can be instantiated twice with appropriate results.

    Implementations with inappropriate global registration of gradients will
    fail this test.
    """

        x = tf.placeholder(tf.float32, [1])
        y = x * x
        y = snt.scale_gradient(y, 0.1)
        y = snt.scale_gradient(y, 0.1)
        dydx = tf.gradients([y], [x])[0]

        with self.test_session() as sess:
            dydx_, y_ = sess.run([dydx, y], feed_dict={x: [3.0]})

            self.assertAlmostEqual(dydx_[0], 2 * 0.1**2 * 3.0, places=6)
            self.assertAlmostEqual(y_[0], 3.0**2, places=6)
Example #22
    def build(self, num_inputs, num_outputs, num_targets):

        # build a new graph and session in which output neuron training will take place
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        with self.graph.as_default():

            # variables common to all neurons
            self.targets = tf.placeholder(dtype=dtype,
                                          shape=[None, num_targets],
                                          name='targets')
            self.inputs, self.neurons, self.losses, self.train_ops, self.place_and_assign, variables = \
                [], [], [], [], [], []

            # group neurons based on their inputs
            for each_num_inputs in num_inputs:
                inputs = tf.placeholder(dtype=dtype,
                                        shape=[None, each_num_inputs],
                                        name='inputs')
                self.inputs.append(inputs)

                # group neurons based on their activations
                for activation in self.activations:

                    # construct prediction of output neuron
                    y_pred, weights, placeholders, assign_ops = self._build_assignable_perceptron(
                        inputs, num_outputs, activation, self.weight_init)
                    self.neurons.append((weights, activation))
                    self.place_and_assign.append((placeholders, assign_ops))

                    # construct loss function
                    loss = reg_loss = self.loss_function(self.targets, y_pred)
                    self.losses.append(loss)
                    if self.regularizer is not None:
                        reg_loss = reg_loss + self.reg_penalty * self.regularizer(
                            weights[0])

                    # construct optimizer
                    self.train_ops.append(
                        (reg_loss, tf.gradients(reg_loss, weights)))
                    variables.extend(weights)

            # initialize variables
            self.sess.run(tf.variables_initializer(variables))
Example #23
    def _compute_gradients(self, loss, var_list=None):
        """Compute gradients using RTRL. The default compute_gradients method of
       the given optimizer will be called. Besides, a Block in the register
       may add additional gradients to its corresponding weight gradients if
       a `` method if provided.

    :param loss: the loss tensor
    :param var_list: variable list (may not be used for now)
    :return: A list of (gradient, variable) pairs as the compute_gradients
              method does in tf.Optimizer
    """
        # Sanity check
        assert isinstance(loss, tf.Tensor)

        # Compute gradients using default method
        assert isinstance(self._register, NodeRegister)
        default_grads_and_vars = self._tf_optimizer.compute_gradients(
            loss, var_list=self._register.default_var_list)

        # Compute gradients using customized method held
        dL_dy = tf.gradients(loss, self._rnn.last_scan_output)[0]
        c_g_n_v, new_buffer = self._register.compute_customized_gradient(dL_dy)
        self._rnn.grad_buffer_slot.plug(new_buffer)

        grads_and_vars = default_grads_and_vars + c_g_n_v
        if th.test_grad:
            _grads_and_vars = self._tf_optimizer.compute_gradients(loss)
            deltas_and_vars = []
            deltas = []
            for _g, _v in _grads_and_vars:
                matches = [g for g, v in grads_and_vars if v is _v]
                assert len(matches) == 1
                g = matches[0]

                delta_name = '_'.join(_v.name.split('/'))
                delta = tf.subtract(g,
                                    _g,
                                    name='delta_{}'.format(delta_name[:-2]))
                deltas_and_vars.append((delta, _v))
                deltas.append(delta)

            self._rnn.grad_delta_slot.plug(tuple(deltas))

        return grads_and_vars
Example #24
def create_optimizer(loss,
                     learning_rate,
                     num_train_steps,
                     weight_decay_rate=0.0,
                     use_tpu=False,
                     warmup_steps=0,
                     warmup_proportion=0,
                     lr_decay_power=1.0,
                     layerwise_lr_decay_power=-1,
                     n_transformer_layers=None):
    """Creates an optimizer and training op."""
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=lr_decay_power,
                                              cycle=False)
    warmup_steps = max(num_train_steps * warmup_proportion, warmup_steps)
    learning_rate *= tf.minimum(
        1.0,
        tf.cast(global_step, tf.float32) / tf.cast(warmup_steps, tf.float32))

    if layerwise_lr_decay_power > 0:
        learning_rate = _get_layer_lrs(learning_rate, layerwise_lr_decay_power,
                                       n_transformer_layers)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=weight_decay_rate,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    if use_tpu:
        optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    tvars = tf.trainable_variables()
    grads = tf.gradients(loss, tvars)
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
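To make the warmup arithmetic concrete (illustrative numbers, not from the source): with num_train_steps=1000 and warmup_proportion=0.1, warmup_steps becomes max(1000 * 0.1, 0) = 100, so at global_step 50 the polynomially decayed rate is additionally scaled by min(1.0, 50 / 100) = 0.5, and the scaling saturates at 1.0 from step 100 onward.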
Example #25
 def test1DAreaMax(self):
   batch_size = 256
   feature_len = 100
   heads = 2
   key_len = 15
   depth = 128
   max_area_width = 3
   queries = tf.random_uniform([batch_size, heads, key_len, depth],
                               minval=-10.0, maxval=10.0)
   features = tf.random_uniform([batch_size, heads, feature_len, depth],
                                minval=-10.0, maxval=10.0)
   feature_length = tf.constant(
       np.concatenate(
           (np.random.randint(max_area_width, feature_len, [batch_size - 1]),
            np.array([feature_len])), axis=0), tf.int32)
   base_mask = tf.expand_dims(tf.sequence_mask(feature_length), 1)
   mask = tf.expand_dims(base_mask, 3)
   mask = tf.tile(mask, [1, heads, 1, depth])
   features = tf.where(mask, features, tf.zeros_like(features))
   # [batch, 1, 1, memory_length]
   bias_mask = tf.expand_dims(base_mask, 1)
   bias = tf.where(
       bias_mask,
       tf.zeros_like(bias_mask, tf.float32),
       tf.ones_like(bias_mask, tf.float32) * -1e9)
   target_values = tf.random_uniform([batch_size, heads, key_len, depth],
                                     minval=-0.2, maxval=0.2)
   keys = tf.layers.dense(features, units=depth)
   values = tf.layers.dense(features, units=depth)
   max_attention = area_attention.dot_product_area_attention(
       queries, keys, values,
       bias=bias,
       area_key_mode="max",
       area_value_mode="max",
       name="max_key",
       max_area_width=max_area_width)
   max_gradients = tf.gradients(
       tf.reduce_mean(
           tf.pow(target_values - max_attention, 2)), features)
   with self.test_session() as session:
     session.run(tf.global_variables_initializer())
     result1, result2 = session.run([max_gradients, max_attention])
   self.assertFalse(np.any(np.logical_not(np.isfinite(result1))))
   self.assertFalse(np.any(np.logical_not(np.isfinite(result2))))
Example #26
    def testTrainingFalse(self, ctor, depth):
        batch_size = 2
        out_channels = 2048 if depth >= 50 else 512
        expected_output_shape = [batch_size, out_channels]
        inputs = tf.random.uniform(dtype=tf.float32,
                                   shape=[batch_size, 224, 224, 3],
                                   seed=1)
        resnet = ctor(depth)
        outputs = resnet(inputs, training=False)
        gradient = tf.gradients(outputs, inputs)
        self.assertListEqual(expected_output_shape, outputs.shape.as_list())
        self.assertEqual(inputs.dtype, outputs.dtype)
        self.assertIsNotNone(gradient)

        with self.cached_session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())
            outputs = sess.run(outputs)
            # Make sure that there are no NaNs
            self.assertFalse(np.isnan(outputs).any())
Example #27
    def __init__(self, sess, state_dim, action_dim, learning_rate):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr_rate = learning_rate

        # Create the actor network
        self.inputs, self.out = self.create_actor_network()

        # Get all network parameters
        self.network_params = \
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor')

        # Set all network parameters
        self.input_network_params = []
        for param in self.network_params:
            self.input_network_params.append(
                tf.placeholder(tf.float32, shape=param.get_shape()))
        self.set_network_params_op = []
        for idx, param in enumerate(self.input_network_params):
            self.set_network_params_op.append(
                self.network_params[idx].assign(param))

        # Selected action, 0-1 vector
        self.acts = tf.placeholder(tf.float32, [None, self.a_dim])

        # This gradient will be provided by the critic network
        self.act_grad_weights = tf.placeholder(tf.float32, [None, 1])

        # Compute the objective (log action_vector and entropy)
        self.obj = tf.reduce_sum(tf.multiply(
                       tf.log(tf.reduce_sum(tf.multiply(self.out, self.acts),
                                            reduction_indices=1, keep_dims=True)),
                       -self.act_grad_weights)) \
                   + ENTROPY_WEIGHT * tf.reduce_sum(tf.multiply(self.out,
                                                           tf.log(self.out + ENTROPY_EPS)))

        # Combine the gradients here
        self.actor_gradients = tf.gradients(self.obj, self.network_params)

        # Optimization Op
        self.optimize = tf.train.RMSPropOptimizer(self.lr_rate).\
            apply_gradients(zip(self.actor_gradients, self.network_params))
Example #28
def jacobian_graph(predictions, x, nb_classes):
    """
  Create the Jacobian graph to be ran later in a TF session
  :param predictions: the model's symbolic output (linear output,
      pre-softmax)
  :param x: the input placeholder
  :param nb_classes: the number of classes the model has
  :return:
  """

    # This function will return a list of TF gradients
    list_derivatives = []

    # Define the TF graph elements to compute our derivatives for each class
    for class_ind in range(nb_classes):
        derivatives, = tf.gradients(predictions[:, class_ind], x)
        list_derivatives.append(derivatives)

    return list_derivatives
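A hypothetical usage sketch for the function above (the input shape and the 3-class model are my assumptions, not from the source):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

x = tf.placeholder(tf.float32, [None, 4])
logits = tf.layers.dense(x, 3)           # pre-softmax model output
jacobian = jacobian_graph(logits, x, 3)  # list of 3 tensors, each shaped like x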
Example #29
    def compute_gradient(self, objective, argument):
        """
        Compute the gradient of 'objective' and return as a function.
        """
        tfgrad = tf.gradients(objective, argument)

        if not isinstance(argument, list):

            def grad(x):
                feed_dict = {argument: x}
                return self._session.run(tfgrad[0], feed_dict)

        else:

            def grad(x):
                feed_dict = {i: d for i, d in zip(argument, x)}
                return self._session.run(tfgrad, feed_dict)

        return grad
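A hypothetical usage note (names are illustrative; assumes the instance holds an open session in self._session):

# objective = tf.reduce_sum(tf.square(x)) for a placeholder `x` of shape [5]
# grad_fn = backend.compute_gradient(objective, x)
# grad_fn(np.ones(5))  # -> array([2., 2., 2., 2., 2.], dtype=float32)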
Example #30
 def __init__(self, sess, state_dim, action_dim, learning_rate):
     self.sess = sess
     self.state_dim = state_dim
     self.action_dim = action_dim
     self.learning_rate = learning_rate
      # create the critic network
     self.model, self.phi, self.states = build_network(self.state_dim)
      # placeholder for the TD targets
     self.td_targets = tf.placeholder(tf.float32, [None, 1])
      # global loss function and gradients
     v_values = self.model.output
     loss = tf.reduce_sum(tf.square(self.td_targets - v_values))
     dj_dphi = tf.gradients(loss, self.phi)
      # gradient clipping
     dj_dphi, _ = tf.clip_by_global_norm(dj_dphi, 40)
      # update the global network using the gradients
     grads = zip(dj_dphi, self.phi)
     self.critic_optimizer = tf.train.AdamOptimizer(
         self.learning_rate).apply_gradients(grads)