Example #1
def prepGeometricTensor(state, vrs):
    nparams = prep_variables(vrs)
    phi_r = tf.math.real(state)
    phi_c = tf.math.imag(state)
    jac_r = jacobian(phi_r, vrs)
    jac_c = jacobian(phi_c, vrs)
    if len(vrs) == 1:
        jac = tf.reshape(tf.complex(jac_r, jac_c), (state.shape + [nparams]))
        jac = tf.split(jac, nparams, axis=-1)
        jac = [tf.reshape(v, state.shape) for v in jac]
    else:
        jac = [tf.complex(jac_r[i], jac_c[i]) for i in range(nparams)]
    return jac, nparams
Example #2
 def test_jacobian_fixed_shape(self):
   x = random_ops.random_uniform([2, 2])
   y = math_ops.matmul(x, x, transpose_a=True)
   jacobian_pfor = gradients.jacobian(y, x, use_pfor=True)
   jacobian_while = gradients.jacobian(y, x, use_pfor=False)
   answer = ops.convert_to_tensor([[
       gradient_ops.gradients(y[0][0], x)[0],
       gradient_ops.gradients(y[0][1], x)[0]
   ], [
       gradient_ops.gradients(y[1][0], x)[0],
       gradient_ops.gradients(y[1][1], x)[0]
   ]])
   self.run_and_assert_equal(answer, jacobian_pfor)
   self.run_and_assert_equal(answer, jacobian_while)
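The test above illustrates the shape convention all of these examples rely on: the Jacobian of y with respect to x stacks the gradient of every element of y, so its shape is y.shape + x.shape. A minimal standalone sketch of that rule, assuming the same TF 1.x pfor-based gradients.jacobian used throughout this page:

import tensorflow as tf
from tensorflow.python.ops.parallel_for import gradients

# Sketch only: jacobian(y, x) has static shape y.shape + x.shape.
x = tf.constant([[1., 2.], [3., 4.]])
y = tf.matmul(x, x, transpose_a=True)
jac = gradients.jacobian(y, x)   # one [2, 2] gradient per element of y
print(jac.shape.as_list())       # [2, 2, 2, 2]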
Example #3
def create_lstm_hessian(batch_size, state_size, steps):
    _, output = lstm_model_fn(batch_size, state_size, steps)
    weights = variables.trainable_variables()
    pfor_jacobians = gradients.jacobian(output, weights, use_pfor=True)
    pfor_hessians = [
        gradients.jacobian(x, weights, use_pfor=True) for x in pfor_jacobians
    ]
    # TODO(agarwal): using two nested while_loop doesn't seem to work here.
    # Hence we use pfor_jacobians for computing while_hessians.
    while_jacobians = pfor_jacobians
    while_hessians = [
        gradients.jacobian(x, weights, use_pfor=False) for x in while_jacobians
    ]
    return pfor_hessians, while_hessians
Example #5
    def _build_hessian_op_grid(self, y, wrt_x1, wrt_x2):
        # DEBUG: self.monolith_hessian = jacobian(tf.gradients(y, wrt_x1)[0], wrt_x2, use_pfor=False)
        # DEBUG: self.monolith_hessian = tf.reshape(self.monolith_hessian, shape=(self._full_height, self._full_width))

        # Compute the full gradient wrt x1. We will slice this later when computing the hessian in a blocky manner.
        full_gradient = tf.gradients(y, wrt_x1)[0]
        full_gradient = tf.reshape(full_gradient, shape=(self._full_height, ))

        # Redefining for readability
        grid_height = self._grid_height
        block_height = self._block_height
        full_height = self._full_height

        # Build a grid of Tensorflow operations - each operation computes a chunk of the Hessian.
        #   The grid is actually just a list of chunks of shape [block_height, n_elements(x2)].
        #   When assembled, the hessian chunk will have shape [n_elements(x1), n_elements(x2)].
        op_grid = [None] * grid_height
        for i_block in range(grid_height):
            # Parameters for the j-axis of the hessian.
            i_start_ix = i_block * block_height
            i_end_ix = min(full_height, (i_block + 1) * block_height)

            # Add to output op grid.
            grad_chunk = full_gradient[i_start_ix:i_end_ix]
            # TODO: it is not clear whether to use pfor or not (since it is still experimental - maybe we can test).
            #  See https://github.com/tensorflow/tensorflow/issues/675#issuecomment-404665051
            hess_chunk = jacobian(grad_chunk, wrt_x2, use_pfor=False)
            hess_chunk = tf.reshape(hess_chunk,
                                    shape=(grad_chunk.shape[0],
                                           self._full_width))

            op_grid[i_block] = hess_chunk

        return op_grid
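The comments above describe assembling the Hessian row-block by row-block: slice the flattened gradient, take the Jacobian of each slice, and stack the chunks. A self-contained sketch of the same idea on a toy objective, assuming TF 1.x graph mode and the same pfor jacobian helper (the variable names below are illustrative, not part of the original class):

import tensorflow as tf
from tensorflow.python.ops.parallel_for.gradients import jacobian

x = tf.Variable(tf.random_normal([6]))
y = tf.reduce_sum(x * x * x)              # toy scalar objective
grad = tf.gradients(y, x)[0]              # flattened gradient, shape [6]

block_height = 2
op_grid = [jacobian(grad[i:i + block_height], x, use_pfor=False)
           for i in range(0, 6, block_height)]
hessian = tf.concat(op_grid, axis=0)      # shape [6, 6]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(hessian).shape)        # (6, 6)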
Example #6
 def caljacob(self, z):
     is_training = False
     jacob = self.sess.run(jacobian(self.x_hat, self.z),
                           feed_dict={
                               self.z: z,
                               self.is_training: is_training
                           })
     return jacob
Example #7
 def inverse(self, x):
     q, p = extract_q_p(x)
     q_prime = self._f.inverse(q)
     df = tf_gradients_ops.jacobian(self._f(q_prime),
                                    q_prime,
                                    use_pfor=True)
     return join_q_p(q_prime,
                     tf.tensordot(df, p, [[4, 5, 6, 7], [0, 1, 2, 3]]))
Example #8
def run_pyket(args):
    hilbert_state_shape = (args.input_size, 1)
    padding = ((0, args.kernel_size - 1), )
    inputs = Input(shape=hilbert_state_shape, dtype='int8')
    x = ToComplex128()(inputs)
    for i in range(args.depth):
        x = PeriodicPadding(padding)(x)
        x = ComplexConv1D(args.width,
                          args.kernel_size,
                          use_bias=False,
                          dtype=tf.complex128)(x)
        x = Activation(lncosh)(x)
    x = Flatten()(x)
    predictions = Lambda(lambda y: tf.reduce_sum(y, axis=1, keepdims=True))(x)
    model = Model(inputs=inputs, outputs=predictions)
    if args.fast_jacobian:
        predictions_jacobian = lambda x: get_predictions_jacobian(keras_model=
                                                                  model)
    else:
        predictions_jacobian = lambda x: gradients.jacobian(
            tf.real(model.output), x, use_pfor=not args.no_pfor)
    if args.use_stochastic_reconfiguration:
        optimizer = ComplexValuesStochasticReconfiguration(
            model,
            predictions_jacobian,
            lr=args.learning_rate,
            diag_shift=10.0,
            iterative_solver=args.use_iterative,
            use_cholesky=args.use_cholesky,
            iterative_solver_max_iterations=None)
        model.compile(optimizer=optimizer,
                      loss=loss_for_energy_minimization,
                      metrics=optimizer.metrics)
    else:
        optimizer = SGD(lr=args.learning_rate)
        model.compile(optimizer=optimizer, loss=loss_for_energy_minimization)
    model.summary()
    operator = Heisenberg(hilbert_state_shape=hilbert_state_shape, pbc=True)
    sampler = MetropolisHastingsHamiltonian(
        model,
        args.batch_size,
        operator,
        num_of_chains=args.pyket_num_of_chains,
        unused_sampels=numpy.prod(hilbert_state_shape))
    variational_monte_carlo = VariationalMonteCarlo(model, operator, sampler)
    model.fit_generator(variational_monte_carlo.to_generator(),
                        steps_per_epoch=5,
                        epochs=1,
                        max_queue_size=0,
                        workers=0)
    start_time = time.time()
    model.fit_generator(variational_monte_carlo.to_generator(),
                        steps_per_epoch=args.num_of_iterations,
                        epochs=1,
                        max_queue_size=0,
                        workers=0)
    end_time = time.time()
    return end_time - start_time
Example #9
 def call(self, x):
     q, p = extract_q_p(x)
     q_prime = self._f(q)
     # Df(q)^{-1} = D(f^{-1}( q_prime ))
     df_inverse = tf_gradients_ops.jacobian(self._f.inverse(q_prime),
                                            q_prime,
                                            use_pfor=True)
     return join_q_p(
         q_prime, tf.tensordot(df_inverse, p, [[4, 5, 6, 7], [0, 1, 2, 3]]))
Example #10
 def test_jacobian_unknown_shape(self):
   with self.test_session() as sess:
     x = array_ops.placeholder(dtypes.float32, shape=[None, None])
     y = math_ops.matmul(x, x, transpose_a=True)
     jacobian_pfor = gradients.jacobian(y, x, use_pfor=True)
     jacobian_while = gradients.jacobian(y, x, use_pfor=False)
     answer = ops.convert_to_tensor([[
         gradient_ops.gradients(y[0][0], x)[0],
         gradient_ops.gradients(y[0][1], x)[0]
     ], [
         gradient_ops.gradients(y[1][0], x)[0],
         gradient_ops.gradients(y[1][1], x)[0]
     ]])
     ans, pfor_value, while_value = sess.run(
         [answer, jacobian_pfor, jacobian_while],
         feed_dict={x: [[1, 2], [3, 4]]})
     self.assertAllClose(ans, pfor_value)
     self.assertAllClose(ans, while_value)
Example #11
  def test_jacobian_scan_shape(self):
    # Shape x: [3, 4]
    x = random_ops.random_uniform([3, 4])
    elems = random_ops.random_uniform([6])
    # Shape y: [6, 3, 4]
    y = functional_ops.scan(lambda a, e: a + e, elems, initializer=x)
    jacobian = gradients.jacobian(y, x)

    expected_shape = [6, 3, 4, 3, 4]
    self.assertAllEqual(expected_shape, jacobian.shape.as_list())
Example #12
    def test_jacobian_scan_shape(self):
        # Shape x: [3, 4]
        x = random_ops.random_uniform([3, 4])
        elems = random_ops.random_uniform([6])
        # Shape y: [6, 3, 4]
        y = functional_ops.scan(lambda a, e: a + e, elems, initializer=x)
        jacobian = gradients.jacobian(y, x)

        expected_shape = [6, 3, 4, 3, 4]
        self.assertAllEqual(expected_shape, jacobian.shape.as_list())
Example #13
 def mixed_partials(self, conf):
     # optimized version to speed things up a little bit.
     grads = self.gradients(conf)
     reverse_shaped = jacobian(grads, self.params, use_pfor=False)
     properly_shaped = []
     for p in reverse_shaped:
         if len(p.get_shape()) == 3:
             properly_shaped.append(tf.transpose(p, perm=(2, 0, 1)))
         if len(p.get_shape()) == 4:
             # properly_shaped.append(tf.reshape(fixed, [-1, fixed.shape[2], fixed.shape[3]]))
             properly_shaped.append(tf.transpose(p, perm=(2, 3, 0, 1)))
     return properly_shaped
Example #14
  def test_jacobian_while_loop_shape(self):
    # Shape x: [3, 4]
    x = random_ops.random_uniform([3, 4])
    _, y = tf_control_flow_ops.while_loop(lambda i, a: i > 5.,
                                          lambda i, a: (i + 1, a + i),
                                          (constant_op.constant(0.), x))
    # Shape y: [2, 3]
    y = y[:2, :3]
    jacobian = gradients.jacobian(y, x)

    expected_shape = [2, 3, 3, 4]
    self.assertAllEqual(expected_shape, jacobian.shape.as_list())
Example #15
    def test_jacobian_while_loop_shape(self):
        # Shape x: [3, 4]
        x = random_ops.random_uniform([3, 4])
        _, y = tf_control_flow_ops.while_loop(lambda i, a: i > 5., lambda i, a:
                                              (i + 1, a + i),
                                              (constant_op.constant(0.), x))
        # Shape y: [2, 3]
        y = y[:2, :3]
        jacobian = gradients.jacobian(y, x)

        expected_shape = [2, 3, 3, 4]
        self.assertAllEqual(expected_shape, jacobian.shape.as_list())
Example #16
def test_sorting():
    # convert to TF tensors
    dtype = tf.float64
    tf_matrices = bitonic_matrices(8)
    for max_fn in [softmax, smoothmax, softmax_smooth]:
        test = to_tf(np.random.randint(-200, 200, 8), dtype=dtype)
        tf_output = tf.reshape(diff_sort(tf_matrices, test), (-1,))
        tf_ranks = diff_argsort(tf_matrices, test)
        tf_argsort = diff_argsort(tf_matrices, test, transpose=True)
        tf_grads = tf.squeeze(jacobian(tf_output, test))
        # compute output and gradient
        with tf.Session() as s:
            s.run((tf_output, tf_grads, tf_ranks, tf_argsort))
Example #17
    def mixed_partials(self, conf):
        """
        Returns list of tensors of mixed partial derivatives evaluated at the input geometry.

        Parameters
        ----------
        conf: tf.placeholder
            An N x 3 configuration placeholder

        Returns
        -------
        tf.Tensor of size len(self.params)
            Returns an unflattened list of mixed partial derivatives [(p_shape), N, 3]
            matching each parameter in get_params()

        """
        # (ytz): Note for implementation purposes, the order of differentiation
        # actually matters. The jacobian system in tensorflow expects a fixed size
        # tensor for the outputs, while permitting a variable list of tensors for
        # inputs. This means that we should naturally use the coordinate derivatives
        # as they all have a fixed N x 3 structure, whereas the input parameters
        # can take on a variadic list of tensors of varying sizes.

        # optimized version to speed things up a little bit.
        grads = self.gradients(conf)

        # taken from tf src gradients_impl.py _IndexedSlicesToTensor
        if isinstance(grads, tf.IndexedSlices):
            grads = tf.unsorted_segment_sum(grads.values, grads.indices,
                                            grads.dense_shape[0])

        reverse_shaped = jacobian(grads, self.params, use_pfor=False)

        if isinstance(reverse_shaped, tf.Tensor):
            # shove to a list
            reverse_shaped = [reverse_shaped]

        properly_shaped = []
        for p in reverse_shaped:
            if len(p.get_shape()) == 2:
                properly_shaped.append(p)  # already ready to go
            elif len(p.get_shape()) == 3:
                properly_shaped.append(tf.transpose(p, perm=(2, 0, 1)))
            elif len(p.get_shape()) == 4:
                # properly_shaped.append(tf.reshape(fixed, [-1, fixed.shape[2], fixed.shape[3]]))
                properly_shaped.append(tf.transpose(p, perm=(2, 3, 0, 1)))
            else:
                # should be easy to support, just add perm=(2,3,...,0,1)
                raise NotImplementedError("Shapes > 4 not supported")
        return properly_shaped
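The docstring above notes that the jacobian helper expects a single fixed-shape output tensor but accepts a list of differently shaped input tensors. A small hedged sketch of that calling convention, assuming TF 1.x and toy parameter shapes chosen purely for illustration:

import numpy as np
import tensorflow as tf
from tensorflow.python.ops.parallel_for.gradients import jacobian

conf = tf.placeholder(tf.float32, shape=[5, 3])      # fixed N x 3 output side
param_a = tf.Variable(tf.ones([2]))                  # variadic, differently
param_b = tf.Variable(tf.ones([4, 4]))               # shaped input side
energy = tf.reduce_sum(conf) * tf.reduce_sum(param_a) * tf.reduce_sum(param_b)
grads = tf.gradients(energy, conf)[0]                # shape [5, 3]

jacs = jacobian(grads, [param_a, param_b], use_pfor=False)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    a_jac, b_jac = sess.run(jacs, feed_dict={conf: np.zeros((5, 3))})
    print(a_jac.shape, b_jac.shape)                  # (5, 3, 2) (5, 3, 4, 4)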
Example #18
def jacobians(ys, xs, parallel_iterations=None):
    """Compute the jacobians of `ys` with respect to `xs`.

  Args:
    ys: tf.Tensor.
    xs: tf.Tensor. The variables with respect to which to compute the Jacobian.
    parallel_iterations: The number of iterations to be done in parallel. Used
        to trade off memory consumption for speed: if None, the Jacobian
        computation is done fully in parallel, but requires the most memory.

  Returns:
    a tf.Tensor of Jacobians.
  """
    return pfor_gradients.jacobian(ys,
                                   xs,
                                   use_pfor=True,
                                   parallel_iterations=parallel_iterations)
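A hedged usage sketch for the wrapper above, assuming TF 1.x: parallel_iterations bounds how many rows of the Jacobian are evaluated at once, trading memory for speed, while the values stay the same (compare the parallel_iterations test further down this page):

import numpy as np
import tensorflow as tf
from tensorflow.python.ops.parallel_for import gradients as pfor_gradients

x = tf.constant([[1., 2.], [3., 4.]])
y = tf.matmul(x, x)
jac_full = pfor_gradients.jacobian(y, x, use_pfor=True)
jac_lean = pfor_gradients.jacobian(y, x, use_pfor=True, parallel_iterations=2)

with tf.Session() as sess:
    full, lean = sess.run([jac_full, jac_lean])
    print(full.shape)                 # (2, 2, 2, 2)
    print(np.allclose(full, lean))    # True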
Example #20
    def __init__(self, model_class, dataset, params):
        """
        Creates necessary ops for deepfool in the tensorflow graph
        Args:
            model_class: The class of the model to construct.
                         Expects subclass of BasicModel
            dataset: The dataset to use.
                     Only necessary for shape and number of classes
            params: Additional parameters to pass to the model init
        """
        self.image = tf.placeholder(dtype=tf.float32, shape=dataset.shape)

        self.model = model_class(tf.expand_dims(self.image, axis=0),
                                 trainable=False,
                                 num_classes=dataset.num_classes,
                                 **params)

        self.logits = self.model.logits[0]
        self.num_classes = self.logits.shape.as_list()[0]
        self.logits_grad = jacobian(self.logits, self.image)
Example #21
def create_fc_per_eg_jacobians(batch_size, activation_size, num_layers):
    model = FullyConnectedModel(activation_size=activation_size,
                                num_layers=num_layers)
    inp = random_ops.random_normal([batch_size, activation_size])
    output = model(inp)
    jacobians = gradients.jacobian(output, variables.trainable_variables())

    def loop_fn(i, use_pfor):
        inp_i = array_ops.expand_dims(array_ops.gather(inp, i), 0)
        output = array_ops.reshape(model(inp_i), [-1])
        return gradients.jacobian(output,
                                  variables.trainable_variables(),
                                  use_pfor=use_pfor)

    per_eg_jacobians_pfor = control_flow_ops.pfor(
        functools.partial(loop_fn, use_pfor=True), batch_size)
    per_eg_jacobians_while = control_flow_ops.for_loop(
        functools.partial(loop_fn, use_pfor=False),
        [dtypes.float32] * len(variables.trainable_variables()), batch_size)
    return jacobians, per_eg_jacobians_pfor, per_eg_jacobians_while
Example #22
def list_jacobian(outputs, inputs):
    """
    Parameters
    ----------
    outputs: tf.Tensor
    inputs: list of tf.Tensor

    Returns
    -------
    list of jacobians [
        tf.Tensor(p0_d0,p0_d1,...,N,3),
        tf.Tensor(p1_d0,p1_d2,...,N,3),
        ...
    ]

    """
    # This is a slightly more advanced version of tensorflow's jacobian system that allows
    # for sparse gradients as well as automatically reshaping the results if outputs is a list.

    # taken from tf src gradients_impl.py _IndexedSlicesToTensor
    outputs = densify(outputs)

    output_dims = list(range(len(outputs.get_shape().as_list())))  # [0,1]
    n_out_dims = len(output_dims)

    # if isinstance(reverse_shaped, tf.Tensor):
    # shove to a list
    # reverse_shaped = [reverse_shaped]

    result = []
    for inp, jac in zip(inputs, jacobian(outputs, inputs, use_pfor=False)):

        input_dims = list(range(len(inp.get_shape().as_list())))  # [0,1]
        perm = [(idx + n_out_dims) for idx in input_dims
                ] + output_dims  # generate permutation indices
        result.append(tf.transpose(jac, perm=perm))

    return result
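A hedged sketch of the reshaping convention list_jacobian applies, assuming TF 1.x (densify and the real parameter tensors are not repeated here; the toy tensors below are illustrative): jacobian returns shape output.shape + input.shape, and the transpose moves the parameter dimensions to the front so each result has shape (p_dims..., N, 3):

import tensorflow as tf
from tensorflow.python.ops.parallel_for.gradients import jacobian

N = 5
coords = tf.Variable(tf.random_normal([N, 3]))
param = tf.Variable(tf.random_normal([2, 4]))
outputs = tf.reduce_sum(param) * coords              # shape (N, 3)

jac = jacobian(outputs, [param], use_pfor=False)[0]  # (N, 3, 2, 4)
jac_t = tf.transpose(jac, perm=[2, 3, 0, 1])         # (2, 4, N, 3)
print(jac_t.shape.as_list())                         # [2, 4, 5, 3]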
Example #23
def test_equal_to_builtin_jacobian(model_builder, batch_size):
    with DEFAULT_TF_GRAPH.as_default():
        keras_model = model_builder()
        keras_model.summary()
        gradient_per_example_t = gradient_per_example(
            tf.real(keras_model.output), keras_model)
        tensorflow_jacobian_t = gradients.jacobian(tf.real(keras_model.output),
                                                   keras_model.weights,
                                                   use_pfor=False)
        print(gradient_per_example_t)
        print(tensorflow_jacobian_t)
        gradient_per_example_func = K.function(inputs=[keras_model.input],
                                               outputs=gradient_per_example_t)
        tensorflow_jacobian_func = K.function(inputs=[keras_model.input],
                                              outputs=tensorflow_jacobian_t)
        size = (batch_size, ) + K.int_shape(keras_model.input)[1:]
        batch = np.random.rand(*size)
        gradient_per_example_vals = gradient_per_example_func([batch])
        tensorflow_jacobian_vals = tensorflow_jacobian_func([batch])
        allclose = [
            np.allclose(a, b, rtol=1e-3) for a, b in zip(
                gradient_per_example_vals, tensorflow_jacobian_vals)
        ]
        assert np.all(allclose)
Example #24
    def compute_task_jacobian(self, policy_loss, policy_loss_quad, tangents):
        # compute hvp
        params1 = self.warmup_policy1.parameters()
        params2 = self.warmup_policy2.parameters()
        params3 = self.warmup_policy3.parameters()
        self.op_task_hvp_Ax = nn.utils.quadgrad_vec_prod(policy_loss_quad,
                                                         params1,
                                                         params2,
                                                         params3,
                                                         tangents,
                                                         AAx=False)
        task_hvp = nn.utils.quadgrad_vec_prod(policy_loss_quad,
                                              params1,
                                              params2,
                                              params3,
                                              tangents,
                                              AAx=True)
        if self.meanAAx:
            nparam = nn.utils.n_parameters_int(
                self.warmup_policy.parameters()).astype(np.float32)
            print("meanAAx nparam:", nparam)
            print(type(nparam))
            task_hvp = task_hvp / nparam
        print("task_hvp:", task_hvp)
        print("task_hvp_Ax:", self.op_task_hvp_Ax)
        self.op_quad_hvp = nn.utils.hessian_vec_prod(
            self.quad_loss, self.warmup_policy.parameters(), tangents)
        self.op_hessian_hvp = nn.utils.hessian_vec_prod(
            policy_loss, self.warmup_policy.parameters(), tangents)
        self.op_hvp = nn.utils.hessian_vec_prod(
            self.mean_kl, self.warmup_policy.parameters(), tangents)

        # compute jacobian
        task_params = self.task.parameters()
        policy_params = self.warmup_policy.parameters()
        policy_gradient_flat = nn.utils.parameters_to_vector(
            tf.gradients(policy_loss, policy_params))
        self.pg_flat = policy_gradient_flat
        self.p_flat = nn.utils.parameters_to_vector(
            self.warmup_policy.parameters())
        print("pg_flat:", self.pg_flat)
        print("p_flat:", self.p_flat)

        task_jacobian = jacobian(policy_gradient_flat,
                                 task_params,
                                 use_pfor=False)
        print(task_jacobian.shape)
        task_jacobian = tf.reshape(task_jacobian, (-1, self.task.n_dim))
        if self.AAx:
            ATb = []
            for i in range(self.task.n_dim):
                ATb_i = nn.utils.quadgrad_vec_prod(policy_loss_quad,
                                                   params1,
                                                   params2,
                                                   params3,
                                                   task_jacobian[:, i],
                                                   AAx=False)
                if self.meanAAx:
                    ATb_i = ATb_i / nn.utils.n_parameters_int(
                        self.warmup_policy.parameters()).astype(np.float32)
                ATb.append(ATb_i)
            self.op_ATb = tf.stack(ATb, axis=0)  #n_dim x |theta|
            print(self.op_ATb.shape)
            #exit(0)

        #############################################################
        jacobian_op = []
        task_gradients = nn.utils.parameters_to_vector(
            tf.gradients(policy_loss, task_params))
        self.task_gradients = task_gradients
        for i in range(self.task.n_dim):
            b = nn.utils.parameters_to_vector(
                tf.gradients(task_gradients[i], policy_params))
            jacobian_op.append(b)
        self.jacobian_op = tf.stack(jacobian_op, axis=0)
        print("jacobian_op:", self.jacobian_op)
        print("task_gradients:", self.task_gradients)
        print("task_jacobian:", task_jacobian)
        #############################################################

        return task_jacobian, task_hvp
Example #25
 def test_jacobian_parallel_iterations(self):
   x = constant_op.constant([[1., 2], [3, 4]])
   y = math_ops.matmul(x, x)
   self.assertAllClose(gradients.jacobian(y, x, parallel_iterations=2),
                       gradients.jacobian(y, x, parallel_iterations=3))
Example #26
 def jacobian(x):
     return gradients.jacobian(tensorflow.math.real(x),
                               params,
                               use_pfor=self.use_pfor)
Example #27
 def test_indexed_slice(self):
     inp = random_ops.random_uniform([3, 2])
     output = nn.embedding_lookup(inp, [0, 2])
     pfor_jacobian = gradients.jacobian(output, inp, use_pfor=True)
     while_jacobian = gradients.jacobian(output, inp, use_pfor=False)
     self.run_and_assert_equal(while_jacobian, pfor_jacobian)
Example #28
 def test_jacobian_parallel_iterations(self):
     x = constant_op.constant([[1., 2], [3, 4]])
     y = math_ops.matmul(x, x)
     self.assertAllClose(gradients.jacobian(y, x, parallel_iterations=2),
                         gradients.jacobian(y, x, parallel_iterations=3))
Example #29
def CTC_Loss():
    # In ctc_loss v1 a sparse matrix is passed in, so if the ground-truth label contains character 0,
    # the loss for character 0 cannot be computed.
    # In v2, passing a sparse tensor gives the same result.
    # This is because index 0 is treated as padding.
    # Therefore, index 0 must not be assigned a meaningful character.
    # In v2 the labels should be passed as a dense tensor, not a sparse one.

    batch_size = 2
    output_T = 5
    target_T = 3  # length of the target. The output_T produced by the model is longer than the target.
    num_class = 4  # 0, 1, 2 are characters; the last index, 3, is the blank.

    x = np.arange(40).reshape(batch_size, output_T,
                              num_class).astype(np.float32)
    x = np.random.randn(batch_size, output_T, num_class)
    x = np.array([[[0.74273746, 0.07847633, -0.89669566, 0.87111101],
                   [0.35377891, 0.87161664, 0.45004634, -0.01664156],
                   [-0.4019564, 0.59862392, -0.90470981, -0.16236736],
                   [0.28194173, 0.82136263, 0.06700599, -0.43223688],
                   [0.1487472, 1.04652007, -0.51399114, -0.4759599]],
                  [[-0.53616811, -2.025543, -0.06641838, -1.88901458],
                   [-0.75484499, 0.24393693, -0.08489008, -1.79244747],
                   [0.36912486, 0.93965647, 0.42183299, 0.89334628],
                   [-0.6257366, -2.25099419, -0.59857886, 0.35591563],
                   [0.72191422, 0.37786281, 1.70582983,
                    0.90937337]]]).astype(np.float32)

    xx = tf.convert_to_tensor(x)
    xx = tf.Variable(xx)
    logits = tf.transpose(xx, [1, 0, 2])

    yy = np.random.randint(0, num_class - 1,
                           size=(batch_size,
                                 target_T))  # low=0, high=3 ==> 0,1,2
    yy = np.array([[1, 2, 2], [1, 0, 1]]).astype(np.int32)
    #yy = np.array([[1, 2, 2,0,0,0],[1,0,2,0,0,0]]).astype(np.int32)  # trailing 0s are treated as padding; 0s in the middle count as characters

    zero = tf.constant(0, dtype=tf.int32)
    where = tf.not_equal(yy, zero)
    indices = tf.where(where)
    values = tf.gather_nd(yy, indices)
    targets = tf.SparseTensor(indices, values, yy.shape)

    # preprocess_collapse_repeated=False  ---> labels may contain repeated characters, so this is naturally False
    # ctc_merge_repeated=False  ---> repeated characters predicted by the model are not merged; this goes against the spirit of CTC loss.
    loss0 = tf.nn.ctc_loss(labels=targets,
                           inputs=logits,
                           sequence_length=[output_T] * batch_size,
                           ctc_merge_repeated=False)
    # This loss0 is meaningless.

    loss1 = tf.nn.ctc_loss(labels=targets,
                           inputs=logits,
                           sequence_length=[output_T] * batch_size)
    loss2 = tf.nn.ctc_loss_v2(labels=yy,
                              logits=logits,
                              label_length=[target_T] * batch_size,
                              logit_length=[output_T] * batch_size,
                              logits_time_major=True,
                              blank_index=num_class - 1)

    # If a sparse tensor is passed as labels, the result matches v1.
    loss3 = tf.nn.ctc_loss_v2(labels=targets,
                              logits=logits,
                              label_length=[3, 3],
                              logit_length=[output_T] * batch_size,
                              logits_time_major=True,
                              blank_index=num_class - 1)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=1)
    gradient = optimizer.compute_gradients(loss1)

    prob = tf.nn.softmax(xx, axis=-1)
    # Use jacobian to compute the derivative of the softmax values with respect to the logits.
    a = xx[0, 1]
    b = tf.nn.softmax(a)
    grad = jacobian(b, a)

    # To convert the derivative w.r.t. the logits into a derivative w.r.t. the softmax, multiply by the inverse of grad.
    # However, the inverse matrix of grad does not exist.

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    l0 = sess.run(loss0)
    l1 = sess.run(loss1)
    l2 = sess.run(loss2)
    l3 = sess.run(loss3)
    print('loss: ', l0, l1, l2, l3)
    g = sess.run(gradient[0][0])
    p = sess.run(prob)
    gg = sess.run(grad)
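A hedged side note on the last comment above, assuming TF 1.x: the Jacobian of softmax(a) with respect to the logits a is diag(s) - s s^T, whose rows sum to zero, so the matrix grad computed above is singular and has no inverse:

import numpy as np
import tensorflow as tf
from tensorflow.python.ops.parallel_for.gradients import jacobian

a = tf.constant([0.7, 0.1, -0.9, 0.9])
s = tf.nn.softmax(a)
grad = jacobian(s, a)                          # shape (4, 4)

with tf.Session() as sess:
    s_val, g_val = sess.run([s, grad])
    analytic = np.diag(s_val) - np.outer(s_val, s_val)
    print(np.allclose(g_val, analytic))        # True
    print(np.allclose(g_val.sum(axis=1), 0.))  # rows sum to zero -> singular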
Example #30
def model_l1_l2_func(nm_set_points, n_in, nn_1, opt_obj, **kwargs_vals):
    hess_approx_flag = False
    neurons_cnt_x1, initializer = kwargs_vals['neurons_cnt'], kwargs_vals[
        'initializer']
    wb_sizes_classif, wb_shapes = kwargs_vals['sizes'], kwargs_vals['shapes']
    x_trained = kwargs_vals['xtr']
    y_trained = kwargs_vals['ytr']
    sess_values = kwargs_vals['sess']
    neurons_cnt = kwargs_vals['neurons_cnt']
    # pcsv = np.genfromtxt('results_paramsparse.csv', delimiter='\t')
    # p = tf.Variable(pcsv, dtype=tf.float64)
    p = tf.Variable(initializer([neurons_cnt], dtype=tf.float64))
    p_store = tf.Variable(tf.zeros([neurons_cnt_x1], dtype=tf.float64))
    save_params_p = tf.assign(p_store, p)
    restore_params_p = tf.assign(p, p_store)
    I_mat = tf.eye(neurons_cnt_x1, dtype=tf.float64)
    shaped_new = np.int(wb_sizes_classif[0]) + np.int(wb_sizes_classif[1])
    # l2_norm_val, all_reg0 = func_structured_l2pen(p, wb_sizes_classif, wb_shapes)
    lambda_param = kwargs_vals['lambda_param']
    lambda_param2 = kwargs_vals['lambda_param2']
    # all_reg_0 = tf.reduce_sum(tf.abs(lasso_p))
    # l2 structured norm loss function
    y_hat_model, y_hat_model_flat_x, y_labeled, x_in, r, l2_norm_val, all_reg0, l2_p, lassop = func_mse_l2(
        n_in, p, nn_1, kwargs_vals)
    r1 = y_labeled - y_hat_model
    loss_val = tf.reduce_sum(
        tf.square(r1)) + lambda_param * all_reg0 + lambda_param2 * l2_norm_val
    mu = tf.placeholder(tf.float64, shape=[1])  # LM parameter
    # initialized store for all params, grad and hessian to be trained
    feed_dict = {x_in: x_trained, y_labeled: y_trained}

    if hess_approx_flag:
        jcb = jacobian(y_hat_model, p)
        grads = tf.stack(
            [tf.gradients(yi, p)[0] for yi in tf.unstack(y_hat_model, axis=1)],
            axis=1)
        print(grads.shape)
        # g_vals = sess_values.run(grads, feed_dict=feed_dict)
        t_jcb = tf.matmul(tf.transpose(jcb), jcb)
        j1 = jacobian_mse(y_hat_model, p, nm_set_points, wb_sizes_classif,
                          wb_shapes)
        jt = tf.transpose(j1)
        partitioned = tf.dynamic_partition(j1,
                                           nm_set_points,
                                           1,
                                           name='dynamic_unstack')
        print(len(partitioned))
        l2_grad = tf.gradients(l2_norm_val, l2_p)[0]
        dxdt = tf.expand_dims(tf.gradients(all_reg0, lassop)[0], 1)
        hess_l2_ps = tf.hessians(l2_norm_val, l2_p)[0]
        print('The shape is;', j1.shape)
        jtj1 = tf.matmul(jt, j1)
        jtr1 = 2 * tf.matmul(jt, r1)
        l2grad = tf.expand_dims(l2_grad, 1)
        s_l2grad = tf.matmul(l2grad, tf.transpose(l2grad))
        # compute gradient of l2 params
        reshaped_gradl2 = jtr1[0:shaped_new]
        reshaped_l20 = reshaped_gradl2 + lambda_param2 * l2grad  # l2_p_grads, 1)
        # build another hessian
        jt_hess = jt[0:shaped_new] + lambda_param2 * l2grad  # l2_p_grads, 1)
        jt_hess_end = tf.concat([jt_hess, jt[shaped_new:, :]], axis=0)
        j1_t = tf.transpose(jt_hess_end)
        # calculate gradient for lasso params group
        reshaped_gradl1 = jtr1[shaped_new:]
        reshaped_gradl0 = reshaped_gradl1 + lambda_param * dxdt  # tf.expand_dims(dxdt, 1) #tf.sign(lasso_p), 1)
        # Assemble the lasso group
        jtj = tf.matmul(jt_hess_end, j1_t)
        jtr = tf.concat([reshaped_l20, reshaped_gradl0], axis=0)
        jtr = tf.reshape(jtr, shape=(neurons_cnt_x1, 1))
        # The other hess using hessian for in --> hid1
        hess_part2 = jtj1[0:shaped_new,
                          0:shaped_new] + s_l2grad  #hess_l2_ps# + h_mat_l2
        hess_partsconc = tf.concat(
            [hess_part2, jtj1[0:shaped_new, shaped_new:]], axis=1)
        jtj3 = tf.concat([hess_partsconc, jtj1[shaped_new:, :]], axis=0)
        # remove it
    else:
        # remove it
        # stop_grads = tf.where(tf.math.equal(p, 0))
        jtj = tf.squeeze(tf.hessians(loss_val, p)[0])
        jtr = -tf.gradients(loss_val, [p])[
            0]  # , stop_gradients=stop_grads, unconnected_gradients='zero')[0]
        jtr = tf.reshape(jtr, shape=(neurons_cnt_x1, 1))
        # jtj = hessian_multivar(loss_val, [p])

    jtj_store = tf.Variable(
        tf.zeros((neurons_cnt_x1, neurons_cnt_x1), dtype=tf.float64))
    jtr_store = tf.Variable(tf.zeros((neurons_cnt_x1, 1), dtype=tf.float64))
    save_jtj_jtr = [tf.assign(jtj_store, jtj), tf.assign(jtr_store, jtr)]

    input_mat = jtj_store + tf.multiply(mu, I_mat)
    try:
        dx = tf.matmul(tf.linalg.inv(input_mat, adjoint=None), jtr_store)
    except:
        c = tf.constant(1, dtype=tf.float64)
        input_mat += np.identity(input_mat.shape) * c
        dx = tf.matmul(tf.linalg.inv(input_mat, adjoint=None), jtr_store)
    dx = tf.squeeze(dx)
    lm = opt_obj.apply_gradients([(-dx, p)])
    # p2 = p.assign(p + dx)
    sess_values = kwargs_vals['sess']

    feed_dict[mu] = np.array([0.01], dtype=np.float64)
    i_cnt = 0
    step = 0
    mat_values = []
    sess_values.run(tf.global_variables_initializer())
    current_loss = sess_values.run(loss_val, feed_dict)

    while feed_dict[mu] > 1e-6 and step < 500:
        p0 = sess_values.run(p)
        p_0_indices = np.where(p == 0)
        p0[p_0_indices] = 0.0
        step += 1
        sess_values.run(save_params_p)
        sess_values.run(restore_params_p)
        if math.log(step, 2).is_integer():
            print('step', 'mu: ', 'current loss: ')
            print(step, feed_dict[mu][0], current_loss)
        success = False
        sess_values.run(jtj_store, feed_dict)
        sess_values.run(jtr_store, feed_dict)
        sess_values.run(save_jtj_jtr, feed_dict)
        for _ in range(400):
            # p0 equals  session object with run of p2 and feed dict
            sess_values.run(jtj_store, feed_dict)
            sess_values.run(jtr_store, feed_dict)
            sess_values.run(save_jtj_jtr, feed_dict)
            sess_values.run(lm, feed_dict)
            p0 = sess_values.run(p)
            p0[np.where(p0 == 0)] = 0
            values_vec = np.where(p0 == 0.0)
            p0[values_vec] = 0.0
            new_loss = sess_values.run(loss_val, feed_dict)
            # sess_values.run(save_jtj_jtr, feed_dict)
            if new_loss < current_loss:
                # divide parameters to 2 groups: 1 for l1 and the other for structured l2
                # shaped_new = np.int(wb_sizes_classif[0]) + np.int(wb_sizes_classif[1])
                lasso_p0 = p0[shaped_new:]
                in2_hidden_params = p0[0:shaped_new]
                # mat_values.append(lasso_p0)
                mat_values.append(p0)
                i_cnt += 1
                if len(mat_values) == 3:
                    sgn1 = mat_values[0] * mat_values[1]
                    sgn2 = mat_values[1] * mat_values[2]
                    # send the parameters to compute the values of structured penalty after
                    # checking if parameters are locally close to zero
                    px = mat_values[2]
                    osc_vec0 = np.where((sgn1 < 0.0) & (sgn2 < 0.0))
                    px[osc_vec0] = 0.0
                    # join both sets of parameter lists here
                    px0 = tf.concat([in2_hidden_params, px], 0)

                    if lambda_param2 > 0.0 and np.mod(step, 5) == 0:
                        px0 = sess_values.run(px0)
                        new_all_params, ws_bs_in1_hid1, condvec = func_compute_cond(
                            px0, lambda_param2, kwargs_vals)
                    else:
                        new_all_params = np.array(sess_values.run(px0))
                    p0 = func_collect_allparams(new_all_params,
                                                wb_sizes_classif, wb_shapes)
                    p.assign(p0)
                    mat_values = []
                    # mat_values = [px]
                else:
                    p.assign(p0)
                # sess_values.run(jtj_store, feed_dict)
                # sess_values.run(jtr_store, feed_dict)
                # sess_values.run(save_jtj_jtr, feed_dict)
                # sess_values.run(save_params_p)
                feed_dict[mu] /= 10
                current_loss = new_loss
                success = True
                break
            else:
                feed_dict[mu] *= 10
                p.assign(p0)
                # sess_values.run(save_params_p)
                sess_values.run(restore_params_p)
                # sess_values.run(save_jtj_jtr, feed_dict)
                # sess_values.run(save_params_p)
        if not success:
            print('Failed to improve')
            break

    p_new = sess_values.run(restore_params_p)
    abs_p = np.abs(p_new)
    idx_absp = np.where(abs_p < 0.01)
    p_new[idx_absp] = 0.0
    new_all_params, ws_bs_in1_hid1, condvec = func_compute_cond(
        p_new, lambda_param2, kwargs_vals)
    p_new = func_collect_allparams(p_new, wb_sizes_classif, wb_shapes)
    # p_new[osc_vec0]=0.0
    non_zero = np.count_nonzero(p_new)
    y_predict, x_inputs = func_pred_new(n_in, nn_1, p_new, **kwargs_vals)
    inw_hid1 = tf.reshape(p_new[0:shaped_new],
                          shape=(wb_shapes[0][0] + wb_shapes[1][0],
                                 wb_shapes[0][1]))
    feed_dict2 = {x_inputs: x_trained}
    print('ENDED ON STEP: ', ' FINAL LOSS:')
    print(step, current_loss)
    print('Input -> hidden layer 1 Parameters: ')
    print(sess_values.run(inw_hid1))
    # cv.close()
    y_model = sess_values.run(y_predict, feed_dict2)
    return restore_params_p, p_new, y_model, current_loss, non_zero
Example #31
 def loop_fn(i, use_pfor):
     inp_i = array_ops.expand_dims(array_ops.gather(inp, i), 0)
     output = array_ops.reshape(model(inp_i), [-1])
     return gradients.jacobian(output,
                               variables.trainable_variables(),
                               use_pfor=use_pfor)
Example #32
def func_classifier_l2l1(xtest1, ytest, kwargs1, kwargspred, **kwargs):
    hess_approx_flag = False
    initializer = kwargs1['initializer']
    mu1, _, mu_dec, max_inc = kwargs['mu'], kwargs['mu_inc'], kwargs[
        'mu_dec'], kwargs['mu_inc']
    wb_shapes, wb_sizes_classif, hidden = kwargspred['wb_shapes'], kwargspred[
        'wb_sizes'], kwargspred['hidden']
    activation, xydat, ydatrain = kwargspred['activation'], kwargspred[
        'xydat'], kwargspred['xydatrain']
    x_in, nclasses = kwargspred['xtr'], kwargs1['nclasses']
    y_labeled = kwargspred['ytr']
    nm_set_points = x_in.shape[0]
    sess, neurons_cnt_x1 = kwargspred['sess'], kwargspred['neurons_cnt']
    opt_obj = kwargspred['opt_obj']
    params0 = tf.Variable(initializer([neurons_cnt_x1], dtype=tf.float64))
    loss, x, y, y_hat_model, l2_norm_val = func_cross_entropy_loss(
        wb_sizes_classif, params0, kwargs1)
    feed_dict = {x: x_in, y: y_labeled}
    feed_dict2 = {x: xtest1, y: ytest}
    # check paper and add selected features
    # add correlation for Park data set
    # tuning parameters
    lambda_param = 0.008
    lambda_param2 = 0.4
    # l2 structured norm loss function
    mu = tf.placeholder(tf.float64, shape=[1])
    # initialized store for all parameters, gradient and H-matrix to be trained # LM parameter
    p_store = tf.Variable(tf.zeros([neurons_cnt_x1], dtype=tf.float64))
    save_params_p = tf.compat.v1.assign(p_store, params0)
    restore_params_p = tf.compat.v1.assign(params0, p_store)
    I_mat = tf.eye(neurons_cnt_x1, dtype=tf.float64)

    shaped_new = np.int(wb_sizes_classif[0]) + np.int(wb_sizes_classif[1])
    lasso_p = params0[shaped_new:]
    l2_p = params0[0:shaped_new]
    print(lasso_p)
    all_reg0 = tf.reduce_sum(tf.abs(lasso_p))
    loss_val = loss + lambda_param * all_reg0 + lambda_param2 * l2_norm_val

    if hess_approx_flag:
        # j1 equal jacobian_classif(y_hat_model, p, nm_set_points)
        # jt = tf.transpose(j1)
        # jtj = tf.matmul(jt, j1)
        # jtr = tf.matmul(jt, r)
        jcb = jacobian(y_hat_model, params0)
        t_jcb = tf.matmul(tf.transpose(jcb), jcb)
        j1 = jacobian_mse(y_hat_model, params0, nm_set_points,
                          wb_sizes_classif, wb_shapes)
        jt = tf.transpose(j1)
        partitioned = tf.dynamic_partition(j1,
                                           nm_set_points,
                                           1,
                                           name='dynamic_unstack')
        print(len(partitioned))
        l2_grad = tf.gradients(l2_norm_val, l2_p)[0]
        dxdt = tf.expand_dims(tf.gradients(all_reg0, lasso_p)[0], 1)
        hess_l2_ps = tf.hessians(l2_norm_val, l2_p)[0]
        print('The shape is;', j1.shape)
        jtj1 = tf.matmul(jt, j1)
        jtr1 = 2 * tf.matmul(jt, r1)
        l2grad = tf.expand_dims(l2_grad, 1)
        s_l2grad = tf.matmul(l2grad, tf.transpose(l2grad))
        # compute gradient of l2 params
        reshaped_gradl2 = jtr1[0:shaped_new]
        reshaped_l20 = reshaped_gradl2 + lambda_param2 * l2grad  # l2_p_grads, 1)
        # build another hessian
        jt_hess = jt[0:shaped_new] + lambda_param2 * l2grad  # l2_p_grads, 1)
        jt_hess_end = tf.concat([jt_hess, jt[shaped_new:, :]], axis=0)
        j1_t = tf.transpose(jt_hess_end)
        # calculate gradient for lasso params group
        reshaped_gradl1 = jtr1[shaped_new:]
        reshaped_gradl0 = reshaped_gradl1 + lambda_param * dxdt  # tf.expand_dims(dxdt, 1) #tf.sign(lasso_p), 1)
        # Assemble the lasso group
        jtj = tf.matmul(jt_hess_end, j1_t)
        jtr = tf.concat([reshaped_l20, reshaped_gradl0], axis=0)
        jtr = tf.reshape(jtr, shape=(neurons_cnt_x1, 1))
        # The other hess using hessian for in --> hid1
        hess_part2 = jtj1[0:shaped_new,
                          0:shaped_new] + s_l2grad  #hess_l2_ps# + h_mat_l2
        hess_partsconc = tf.concat(
            [hess_part2, jtj1[0:shaped_new, shaped_new:]], axis=1)
        jtj3 = tf.concat([hess_partsconc, jtj1[shaped_new:, :]], axis=0)
    else:
        # remove it
        # stop_grads = tf.where(tf.math.equal(p, 0))
        # jtj = hessian_multivar(loss_val, [params0])
        jtj = tf.hessians(loss_val, params0)[0]
        jtr = -tf.gradients(loss_val, params0)[
            0]  # stop_gradients=stop_grads, unconnected_gradients='zero')[0]
        jtr = tf.reshape(jtr, shape=(neurons_cnt_x1, 1))

    jtj_store = tf.Variable(
        tf.zeros((neurons_cnt_x1, neurons_cnt_x1), dtype=tf.float64))
    jtr_store = tf.Variable(tf.zeros((neurons_cnt_x1, 1), dtype=tf.float64))
    save_jtj_jtr = [tf.assign(jtj_store, jtj), tf.assign(jtr_store, jtr)]

    input_mat = jtj_store + tf.multiply(mu, I_mat)
    try:
        dx = tf.matmul(tf.linalg.inv(input_mat, adjoint=None), jtr_store)
    except:
        c = tf.constant(0.1, dtype=tf.float64)
        input_mat += np.identity(input_mat.shape) * c
        dx = tf.matmul(tf.linalg.inv(input_mat, adjoint=None), jtr_store)
    dx = tf.squeeze(dx)
    lm = opt_obj.apply_gradients([(-dx, params0)])
    # p2 equal p.assign(p + dx)
    sess_values = kwargspred['sess']
    # print(sess_values.run(lasso_p))
    feed_dict[mu] = np.array([0.1], dtype=np.float64)
    i_cnt = 0
    step = 0
    mat_values = []
    sess_values.run(tf.global_variables_initializer())
    current_loss = sess_values.run(loss_val, feed_dict)

    while feed_dict[mu] > 1e-10 and step < 200:
        p0 = sess_values.run(params0)
        values_vec = np.where(params0 == 0)
        p0[values_vec] = 0.0
        step += 1
        sess.run(save_params_p)
        # sess.run(restore_params_p)
        if math.log(step, 2).is_integer():
            print('step', 'mu: ', 'current loss: ')
            print(step, feed_dict[mu][0], current_loss)
        success = False
        sess_values.run(jtj_store, feed_dict)
        sess_values.run(p_store)
        for _ in range(400):
            sess_values.run(save_jtj_jtr, feed_dict)
            sess_values.run(jtj_store, feed_dict)
            # p0 equals  session object with run of p2 and feed dict
            sess_values.run(lm, feed_dict)
            p0 = sess_values.run(params0)
            # p0 equals tf.where(p == 0, tf.zeros_like(p), p)
            values_vec = np.where(p0 == 0.0)
            p0[values_vec] = 0.0
            new_loss = sess_values.run(loss_val, feed_dict)
            if new_loss < current_loss:
                # divide parameters to 2 groups: 1 for l1 and the other for structured l2
                shaped_new = np.int(wb_sizes_classif[0]) + np.int(
                    wb_sizes_classif[1])
                lasso_p0 = p0[shaped_new:]
                in2_hidden_params = p0[0:shaped_new]
                mat_values.append(lasso_p0)
                i_cnt += 1
                if len(mat_values) == 3:
                    sgn1 = mat_values[0] * mat_values[1]
                    sgn2 = mat_values[1] * mat_values[2]  # store parameters
                    # checking if parameters are locally close to zero
                    px = mat_values[2]
                    values_vec = np.where((sgn1 < 0) & (sgn2 < 0))
                    px[values_vec] = 0.0
                    print(len(mat_values))
                    # join both sets of parameter lists here joined_params = np.concatenate(l2_params_set, new_p0)
                    px0 = tf.concat([in2_hidden_params, px], 0)
                    if lambda_param2 > 0.0 and np.mod(step, 2) == 0:
                        px0 = sess_values.run(px0)
                        new_all_params, ws_bs_in1_hid1, _ = func_compute_cond(
                            px0, lambda_param2, kwargspred)
                    else:
                        new_all_params = np.array(sess_values.run(px0))
                    # sess_px0 equal np.array(sess_values.run(px0))
                    p_values_send = func_collect_allparams(
                        new_all_params, wb_sizes_classif, wb_shapes)
                    print(p_values_send.shape)
                    params0.assign(p_values_send)
                    i_cnt = 0
                    mat_values = []
                    mat_values = [px]
                else:
                    params0.assign(p0)
                feed_dict[mu] /= 10
                current_loss = new_loss
                success = True
                break
            else:
                feed_dict[mu] *= 10
                params0.assign(p0)
                # sess.run(save_params_p)
                sess_values.run(restore_params_p)
        if not success:
            print('Failed to improve')
            break

    p_new = sess_values.run(restore_params_p)
    abs_p = np.abs(p_new)
    idx_absp = np.where(abs_p < 0.1)
    p_new[idx_absp] = 0.0
    p_new[values_vec] = 0.0
    correct_prediction, feed_dict2, y_hat_classif_logits = predclassif(
        wb_sizes_classif, xydat, hidden, p_new, activation, wb_shapes,
        nclasses)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print('ENDED ON STEP: ')
    print(step)
    print(' FINAL LOSS:')
    print(current_loss)
    print('Parameters: ')
    print(sess_values.run(restore_params_p))
    print('Parameters: ')
    print(p_new)
    print("Accuracy:", sess.run(accuracy, feed_dict2))
    correct_predictions = sess.run(y_hat_classif_logits, feed_dict2)
    correct_prediction, feed_dict21, y_hat_classif_logits = predclassif(
        wb_sizes_classif, ydatrain, hidden, p_new, activation, wb_shapes,
        nclasses)
    correct_predictions_train = sess.run(y_hat_classif_logits, feed_dict21)
    return p_new, correct_predictions, correct_predictions_train
Example #33
 def loop_fn(i, use_pfor):
     image = array_ops.gather(images, i)
     logits = array_ops.reshape(model(image, training=training), [-1])
     return gradients.jacobian(logits,
                               variables.trainable_variables(),
                               use_pfor=use_pfor)
Example #34
def empirical_NTK(model, train_images):
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    print(rank)

    model.compile("sgd", loss=lambda target, pred: pred)
    import tensorflow.keras.backend as K
    num_layers = len(model.trainable_weights)
    trainable_weights = np.array(model.trainable_weights)

    # fs = []
    # params_per_chunk = []

    num_chunks = min(size, num_layers)
    layers_per_chunk = num_layers // num_chunks
    if rank < num_chunks:
        chunks = list(
            range(int(rank * layers_per_chunk),
                  int((rank + 1) * layers_per_chunk)))

        if rank < num_layers % num_chunks:
            chunks.append(num_chunks * layers_per_chunk + rank)
        params_per_layer = np.array(
            [np.prod(x.shape) for x in trainable_weights])
        params_per_chunk = sum(params_per_layer[chunks])
        # grads = model.optimizer.get_gradients(model.output, list(trainable_weights[chunks]))
        # grads = tf.keras.backend.gradients(model.output, list(trainable_weights[chunks]))
        # grads = tf.gradients(model.output, list(trainable_weights[chunks]))
        # symb_inputs = (model._feed_inputs + model._feed_targets)
        symb_inputs = model._feed_inputs
        grads = jacobian(model.output, list(trainable_weights[chunks]))
        f = K.function(symb_inputs, grads)

        def get_weight_grad(model, inputs, outputs):
            """ Gets gradient of model for given inputs and outputs for all weights"""
            x, y, _ = model._standardize_user_data(inputs, outputs)
            batch_size = inputs.shape[0]
            # output_grad = f(x + y)
            output_grad = f(x)
            print(output_grad[0].shape)
            # output_grad = np.concatenate([x.flatten() for x in output_grad])
            output_grad = np.concatenate(
                [x.reshape((batch_size, -1)) for x in output_grad])
            return output_grad

        X = train_images
        m = len(X)
        Y = np.zeros((len(X), 1))
        NTK = np.zeros((len(X), len(X)))
        chunk1 = 25
        chunk2 = chunk1
        # it's beneficial to chunk in j2 too, in order to reduce the Python for loop. Even though we do more in numpy/pytorch (by reducing the chunking on j1, we do more grad computations), Python is much slower than those, and so the tradeoff is worth it I think
        # print("tot_parameters",tot_parameters)
        # jac1 = np.zeros((chunk1,params_per_chunk))
        # jac2 = np.zeros((chunk2,params_per_chunk))
        num_chunk1s = m // chunk1
        if m % chunk1 > 0:
            num_chunk1s += 1
        num_chunk2s = m // chunk2
        if m % chunk2 > 0:
            num_chunk2s += 1
        for j1 in range(num_chunk1s):
            if m % chunk1 > 0 and j1 == num_chunk1s - 1:
                num_inputs1 = m % chunk1
                # jac1 = np.zeros((num_inputs1,params_per_chunk))
            else:
                num_inputs1 = chunk1
            print("chunk", j1, "out of", num_chunk1s)
            sys.stdout.flush()
            # for i in range(num_inputs1):
            #     gradient = get_weight_grad(model, train_images[j1*chunk1+i:j1*chunk1+i+1], Y[j1*chunk1+i:j1*chunk1+i+1])
            #     jac1[i,:] = gradient
            jac1 = get_weight_grad(
                model, train_images[j1 * chunk1:j1 * chunk1 + num_inputs1],
                Y[j1 * chunk1:j1 * chunk1 + num_inputs1])
            print(jac1.shape)
            for j2 in range(j1, num_chunk2s):
                if m % chunk2 > 0 and j2 == num_chunk2s - 1:
                    num_inputs2 = m % chunk2
                    # jac2 = np.zeros((num_inputs2,params_per_chunk))
                else:
                    num_inputs2 = chunk2
                print(j1, j2)
                # for i in range(num_inputs2):
                #     gradient = get_weight_grad(model, train_images[j2*chunk2+i:j2*chunk2+i+1], Y[j2*chunk2+i:j2*chunk2+i+1])
                #     jac2[i,:] = gradient
                jac2 = get_weight_grad(
                    model, train_images[j2 * chunk2:j2 * chunk2 + num_inputs2],
                    Y[j2 * chunk2:j2 * chunk2 + num_inputs2])
                NTK[j1 * chunk1:j1 * chunk1 + num_inputs1,
                    j2 * chunk2:j2 * chunk2 + num_inputs2] += np.matmul(
                        jac1, jac2.T)

    ntk_recv = None
    if rank == 0:
        ntk_recv = np.zeros_like(NTK)
    comm.Reduce(NTK, ntk_recv, op=MPI.SUM, root=0)
    if rank == 0:
        NTK = (ntk_recv + ntk_recv.T) / 2
        return NTK
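The chunking comment above boils down to assembling the Gram matrix J J^T of the per-example Jacobian block by block, so that only two chunks of rows are in memory at a time. A hedged NumPy-only sketch of that assembly (toy sizes and names chosen here for illustration):

import numpy as np

def blockwise_gram(jac_rows, chunk):
    # NTK = J J^T accumulated chunk by chunk (both triangles filled).
    m = jac_rows.shape[0]
    ntk = np.zeros((m, m))
    for j1 in range(0, m, chunk):
        for j2 in range(j1, m, chunk):
            block = jac_rows[j1:j1 + chunk] @ jac_rows[j2:j2 + chunk].T
            ntk[j1:j1 + chunk, j2:j2 + chunk] = block
            ntk[j2:j2 + chunk, j1:j1 + chunk] = block.T
    return ntk

J = np.random.randn(10, 7)      # 10 examples, 7 parameters (toy sizes)
print(np.allclose(blockwise_gram(J, 4), J @ J.T))   # True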
Example #35
def getGrad2(image):
    with tf.GradientTape(persistent=True) as tape:
        x, concepts, _ = model.helper(image)
    gradients = jacobian(concepts, x)
    return gradients