def random_tril_matrix(
    shape, dtype, force_well_conditioned=False, remove_upper=True):
  """[batch] lower triangular matrix.

  Args:
    shape:  `TensorShape` or Python `list`.  Shape of the returned matrix.
    dtype:  `TensorFlow` `dtype` or Python dtype
    force_well_conditioned:  Python `bool`. If `True`, returned matrix will have
      eigenvalues with modulus in `(1, 2)`.  Otherwise, eigenvalues are unit
      normal random variables.
    remove_upper:  Python `bool`.
      If `True`, zero out the strictly upper triangle.
      If `False`, the lower triangle of returned matrix will have desired
      properties, but will not not have the strictly upper triangle zero'd out.

  Returns:
    `Tensor` with desired shape and dtype.
  """
  with tf.name_scope("random_tril_matrix"):
    # Totally random matrix.  Has no nice properties.
    tril = random_normal(shape, dtype=dtype)
    if remove_upper:
      tril = tf.matrix_band_part(tril, -1, 0)

    # Create a diagonal with entries having modulus in [1, 2].
    if force_well_conditioned:
      maxval = tf.convert_to_tensor(np.sqrt(2.), dtype=dtype.real_dtype)
      diag = random_sign_uniform(
          shape[:-1], dtype=dtype, minval=1., maxval=maxval)
      tril = tf.matrix_set_diag(tril, diag)

    return tril
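A minimal usage sketch for the helper above, assuming TensorFlow 1.x and the `random_normal` / `random_sign_uniform` test-util helpers that `random_tril_matrix` calls (not shown in this excerpt):

import numpy as np
import tensorflow as tf

# Draw a batch of well-conditioned lower triangular matrices and confirm
# the strictly upper triangle really is zeroed out.
tril = random_tril_matrix([2, 4, 4], dtype=tf.float64,
                          force_well_conditioned=True)
with tf.Session() as sess:
    tril_ = sess.run(tril)
assert np.allclose(np.triu(tril_, k=1), 0.0)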
Example #2
def gauss_kl(q_mu, q_sqrt, K):
    """
    Compute the KL divergence from

          q(x) = N(q_mu, q_sqrt^2)
    to
          p(x) = N(0, K)

    We assume multiple independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean.

    q_sqrt is a 3D tensor, each matrix within is a lower triangular square-root
        matrix of the covariance of q.

    K is a positive definite matrix: the covariance of p.
    """
    L = tf.cholesky(K)
    alpha = tf.matrix_triangular_solve(L, q_mu, lower=True)
    KL = 0.5 * tf.reduce_sum(tf.square(alpha))  # Mahalanobis term.
    num_latent = tf.cast(tf.shape(q_sqrt)[2], float_type)
    KL += num_latent * 0.5 * tf.reduce_sum(tf.log(tf.square(tf.diag_part(L))))  # Prior log-det term.
    KL += -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]), float_type)  # constant term
    Lq = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # force lower triangle
    KL += -0.5*tf.reduce_sum(tf.log(tf.square(tf.matrix_diag_part(Lq))))  # logdet
    L_tiled = tf.tile(tf.expand_dims(L, 0), tf.stack([tf.shape(Lq)[0], 1, 1]))
    LiLq = tf.matrix_triangular_solve(L_tiled, Lq, lower=True)
    KL += 0.5 * tf.reduce_sum(tf.square(LiLq))  # Trace term
    return KL
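For reference, the terms above sum to the closed-form Gaussian KL, KL(q || p) = 0.5 * (tr(K^-1 S) + mu^T K^-1 mu - d + log|K| - log|S|), accumulated over the independent columns. A hedged NumPy-only check of that formula for a single latent function (all names here are illustrative, not from the original):

import numpy as np

rng = np.random.RandomState(0)
d = 3
K = np.eye(d) + 0.1 * rng.rand(d, d)
K = K @ K.T                                # covariance of p
mu = rng.randn(d, 1)                       # mean of q
L_q = np.tril(rng.rand(d, d)) + np.eye(d)  # lower triangular sqrt of q's covariance
S = L_q @ L_q.T

K_inv = np.linalg.inv(K)
kl = 0.5 * (np.trace(K_inv @ S) + (mu.T @ K_inv @ mu).item() - d
            + np.log(np.linalg.det(K)) - np.log(np.linalg.det(S)))
# kl should agree with gauss_kl(mu, L_q[:, :, None], K) up to float precision.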
 def _random_cholesky_array(self, shape):
   mat = self._rng.rand(*shape)
   chol = distribution_util.matrix_diag_transform(
       mat, transform=tf.nn.softplus)
   # Zero the upper triangle because we're using this as a true Cholesky factor
   # in our tests.
   return tf.matrix_band_part(chol, -1, 0).eval()
  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
    shape = list(shape)
    diag_shape = shape[:-1]

    # Upper triangle will be ignored.
    # Use a diagonal that ensures this matrix is well conditioned.
    tril = tf.random_normal(shape=shape, dtype=dtype.real_dtype)
    diag = tf.random_uniform(
        shape=diag_shape, dtype=dtype.real_dtype, minval=2., maxval=3.)
    if dtype.is_complex:
      tril = tf.complex(
          tril, tf.random_normal(shape, dtype=dtype.real_dtype))
      diag = tf.complex(
          diag, tf.random_uniform(
              shape=diag_shape, dtype=dtype.real_dtype, minval=2., maxval=3.))

    tril = tf.matrix_set_diag(tril, diag)

    tril_ph = tf.placeholder(dtype=dtype)

    if use_placeholder:
      # Evaluate the tril here because (i) you cannot feed a tensor, and (ii)
      # tril is random and we want the same value used for both mat and
      # feed_dict.
      tril = tril.eval()
      operator = linalg.LinearOperatorTriL(tril_ph)
      feed_dict = {tril_ph: tril}
    else:
      operator = linalg.LinearOperatorTriL(tril)
      feed_dict = None

    mat = tf.matrix_band_part(tril, -1, 0)

    return operator, mat, feed_dict
    def call(self, x, mask=None):
        x1, x2 = x
        outer = tf.matmul(tf.expand_dims(x1, axis=2), tf.expand_dims(x2, axis=1))
        outer = tf.matrix_band_part(outer, 0, self.ans_limit)
        output1 = tf.reshape(tf.cast(tf.argmax(tf.reduce_max(outer, axis=2), axis=1), tf.float32),(-1,1))
        output2 = tf.reshape(tf.cast(tf.argmax(tf.reduce_max(outer, axis=1), axis=1), tf.float32),(-1,1))

        return [output1, output2]
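The band-part call in `call` above is the usual span-length trick for reading-comprehension models: keeping band (0, ans_limit) of the start/end outer product zeroes every pair with end < start or end > start + ans_limit. A hedged standalone sketch, assuming TensorFlow 1.x:

import tensorflow as tf

start_probs = tf.nn.softmax(tf.random_normal([2, 6]))  # [batch, seq_len]
end_probs = tf.nn.softmax(tf.random_normal([2, 6]))
outer = tf.matmul(tf.expand_dims(start_probs, axis=2),
                  tf.expand_dims(end_probs, axis=1))
outer = tf.matrix_band_part(outer, 0, 3)  # keep only spans with end - start <= 3
best_start = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
best_end = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)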
Example #6
  def _sample_n(self, n, seed):
    batch_shape = self.batch_shape_tensor()
    event_shape = self.event_shape_tensor()
    batch_ndims = tf.shape(batch_shape)[0]

    ndims = batch_ndims + 3  # sample_ndims=1, event_ndims=2
    shape = tf.concat([[n], batch_shape, event_shape], 0)
    stream = seed_stream.SeedStream(seed, salt="Wishart")

    # Complexity: O(nbk**2)
    x = tf.random_normal(
        shape=shape, mean=0., stddev=1., dtype=self.dtype, seed=stream())

    # Complexity: O(nbk)
    # This parametrization is equivalent to Chi2, i.e.,
    # ChiSquared(k) == Gamma(alpha=k/2, beta=1/2)
    expanded_df = self.df * tf.ones(
        self.scale_operator.batch_shape_tensor(),
        dtype=self.df.dtype.base_dtype)

    g = tf.random_gamma(
        shape=[n],
        alpha=self._multi_gamma_sequence(0.5 * expanded_df, self.dimension),
        beta=0.5,
        dtype=self.dtype,
        seed=stream())

    # Complexity: O(nbk**2)
    x = tf.matrix_band_part(x, -1, 0)  # Tri-lower.

    # Complexity: O(nbk)
    x = tf.matrix_set_diag(x, tf.sqrt(g))

    # Make batch-op ready.
    # Complexity: O(nbk**2)
    perm = tf.concat([tf.range(1, ndims), [0]], 0)
    x = tf.transpose(x, perm)
    shape = tf.concat([batch_shape, [event_shape[0]], [-1]], 0)
    x = tf.reshape(x, shape)

    # Complexity: O(nbM) where M is the complexity of the operator solving a
    # vector system. For LinearOperatorLowerTriangular, each matmul is O(k^3) so
    # this step has complexity O(nbk^3).
    x = self.scale_operator.matmul(x)

    # Undo make batch-op ready.
    # Complexity: O(nbk**2)
    shape = tf.concat([batch_shape, event_shape, [n]], 0)
    x = tf.reshape(x, shape)
    perm = tf.concat([[ndims - 1], tf.range(0, ndims - 1)], 0)
    x = tf.transpose(x, perm)

    if not self.input_output_cholesky:
      # Complexity: O(nbk**3)
      x = tf.matmul(x, x, adjoint_b=True)

    return x
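The sampler above is the Bartlett decomposition of the Wishart distribution: a lower triangular factor with chi-distributed diagonal and standard normals below it, pushed through the scale operator. A hedged NumPy sketch of the same construction (the helper name is mine, not from the original):

import numpy as np

def wishart_bartlett(df, scale_tril, rng):
    """One draw from Wishart(df, scale_tril @ scale_tril.T) via Bartlett."""
    k = scale_tril.shape[-1]
    a = np.tril(rng.randn(k, k), k=-1)  # strictly lower standard normals
    np.fill_diagonal(a, np.sqrt(rng.chisquare(df - np.arange(k))))
    la = scale_tril @ a
    return la @ la.T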
 def _forward(self, x):
   if self.validate_args:
     is_matrix = tf.assert_rank_at_least(x, 2)
     shape = tf.shape(x)
     is_square = tf.assert_equal(shape[-2], shape[-1])
     x = control_flow_ops.with_dependencies([is_matrix, is_square], x)
   # For safety, explicitly zero-out the upper triangular part.
   x = tf.matrix_band_part(x, -1, 0)
   return tf.matmul(x, x, adjoint_b=True)
Example #8
 def CheckUnitary(self, x):
   # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
   xx = tf.matmul(x, x, adjoint_a=True)
   identity = tf.matrix_band_part(tf.ones_like(xx), 0, 0)
   if is_single:
     tol = 1e-5
   else:
     tol = 1e-14
   self.assertAllClose(identity.eval(), xx.eval(), atol=tol)
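A hedged aside on the identity construction above, assuming TensorFlow 1.x: band (0, 0) of an all-ones matrix keeps only the main diagonal, which yields an identity with the same batch shape and dtype as `xx` without building `tf.eye` per batch member.

import tensorflow as tf

eye_batch = tf.matrix_band_part(tf.ones([2, 3, 3]), 0, 0)
# eye_batch is a batch of two 3x3 identity matrices.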
 def Test(self):
   shape = batch_shape_ + shape_
   x = tf.constant(np.random.rand(*shape), dtype=dtype_)
   with self.test_session(use_gpu=True):
     for lower in -1, 0, 1, shape_[-2] - 1:
       for upper in -1, 0, 1, shape_[-1] - 1:
         y = tf.matrix_band_part(x, lower, upper)
         error = tf.test.compute_gradient_error(x, x.get_shape().as_list(), y,
                                                y.get_shape().as_list())
         self.assertLess(error, 1e-4)
Example #10
def get_right_context_mask(time_steps):
    """ Generates the mask preventing the decoder from attending to unseen positions. """
    # Generate mask that limits decoder self-attention up to and including the current position
    attn_mask = tf.matrix_band_part(tf.ones([time_steps, time_steps]), -1, 0)
    # Expand the mask to 4D so as to be compatible with the attention weights
    attn_mask = tf.expand_dims(tf.expand_dims(attn_mask, 0), 0)
    # Illegal connections are given a large negative bias when fed into the
    # softmax; -1e9 is used rather than -inf to prevent NaNs
    attn_mask = -1e9 * (1.0 - attn_mask)
    attn_mask = -1e9 * (1.0 - attn_mask)
    return attn_mask
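A hedged usage sketch for the mask above, assuming TensorFlow 1.x: the bias is added to the raw attention logits, so after the softmax each position effectively attends only to itself and earlier positions.

import tensorflow as tf

logits = tf.random_normal([2, 4, 5, 5])  # [batch, heads, time, time]
logits += get_right_context_mask(5)      # broadcasts over batch and heads
weights = tf.nn.softmax(logits)          # future positions get ~0 weight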
def mask_leq(target_length, source_length):
  """A mask with 1.0 wherever source_pos <= target_pos and 0.0 elsewhere.

  Args:
    target_length: an integer
    source_length: an integer
  Returns:
    a Tensor with shape [1, target_length, source_length]
  """
  return tf.expand_dims(
      tf.matrix_band_part(tf.ones([target_length, source_length]), -1, 0), 0)
Example #12
def attention_bias_lower_triangle(length):
    """ Create a bias tensor to be added to attention logits.

      Allows a query to attend to all positions up to and including its own.
    Args:
        length: A scalar.

    Returns: A float Tensor of shape [1, 1, length, length], with -1e9 in
      padding positions and 0 in non-padding positions.

    """
    lower_triangle = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
    ret = FLOAT_MIN * (1. - lower_triangle)
    return tf.reshape(ret, [1, 1, length, length])
 def Test(self):
   mat = np.ones(shape_).astype(dtype_)
   batch_mat = np.tile(mat, batch_shape + (1, 1))
   with self.test_session(use_gpu=True):
     for lower in -1, 0, 1, shape_[-2] - 1:
       for upper in -1, 0, 1, shape_[-1] - 1:
         band_np = mat
         if lower >= 0:
           band_np = np.triu(band_np, -lower)
         if upper >= 0:
           band_np = np.tril(band_np, upper)
          if batch_shape != ():
           band_np = np.tile(band_np, batch_shape + (1, 1))
         band = tf.matrix_band_part(batch_mat, lower, upper)
         self.assertAllEqual(band_np, band.eval())
Example #14
def get_decoder_self_attention_bias(length):
  """Calculate bias for decoder that maintains model's autoregressive property.

  Creates a tensor that masks out locations that correspond to illegal
  connections, so prediction at position i cannot draw information from future
  positions.

  Args:
    length: int length of sequences in batch.

  Returns:
    float tensor of shape [1, 1, length, length]
  """
  with tf.name_scope("decoder_self_attention_bias"):
    valid_locs = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
    decoder_bias = _NEG_INF * (1.0 - valid_locs)
  return decoder_bias
 def _assertions(self, x):
   if not self.validate_args:
     return []
   x_shape = tf.shape(x)
   is_matrix = tf.assert_rank_at_least(
       x, 2,
       message="Input must have rank at least 2.")
   is_square = tf.assert_equal(
       x_shape[-2], x_shape[-1],
       message="Input must be a square matrix.")
   diag_part_x = tf.matrix_diag_part(x)
   is_lower_triangular = tf.assert_equal(
       tf.matrix_band_part(x, 0, -1),  # Preserves triu, zeros rest.
       tf.matrix_diag(diag_part_x),
       message="Input must be lower triangular.")
   is_positive_diag = tf.assert_positive(
       diag_part_x,
       message="Input must have all positive diagonal entries.")
   return [is_matrix, is_square, is_lower_triangular, is_positive_diag]
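A hedged illustration of the triangularity check above, assuming TensorFlow 1.x: for a lower triangular input, band (0, -1), i.e. the upper triangle including the diagonal, contains nothing but the diagonal, so it equals tf.matrix_diag of the diagonal part.

import tensorflow as tf

x = tf.constant([[1., 0.], [2., 3.]])
upper = tf.matrix_band_part(x, 0, -1)          # [[1., 0.], [0., 3.]]
diag = tf.matrix_diag(tf.matrix_diag_part(x))  # identical, so the assert holds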
  def testNonDefaultsYieldCorrectShapesAndValues(self):
    batch_shape = [4, 3]
    x_size = 3
    mvn_size = 5
    x_ = np.random.randn(*np.concatenate([batch_shape, [x_size]]))

    x = tf.constant(x_)
    mvn = tfp.trainable_distributions.multivariate_normal_tril(
        x,
        dims=mvn_size,
        loc_fn=tf.zeros_like,
        scale_fn=lambda x: tfd.fill_triangular(tf.ones_like(x)))
    scale = mvn.scale.to_dense()
    expected_scale = tf.matrix_band_part(
        tf.ones(np.concatenate([batch_shape, [mvn_size, mvn_size]]),
                scale.dtype),
        num_lower=-1,
        num_upper=0)

    self.evaluate(tf.global_variables_initializer())
    [
        batch_shape_,
        event_shape_,
        loc_,
        scale_,
        expected_scale_,
    ] = self.evaluate([
        mvn.batch_shape_tensor(),
        mvn.event_shape_tensor(),
        mvn.loc,
        scale,
        expected_scale,
    ])

    self.assertAllEqual(batch_shape, mvn.batch_shape)
    self.assertAllEqual(batch_shape, batch_shape_)

    self.assertAllEqual([mvn_size], mvn.event_shape)
    self.assertAllEqual([mvn_size], event_shape_)

    self.assertAllEqual(np.zeros_like(loc_), loc_)
    self.assertAllEqual(expected_scale_, scale_)
Example #17
def get_multivariate_gaussian_energy_fn(x_dim=2):
  """Get energy function for 2d strongly correlated Gaussian."""

  mu = tf.random_normal(shape=[x_dim])
  # Lower triangular with positive entries (apply the sigmoid before
  # matrix_band_part; the reverse order would fill the zeroed upper
  # triangle with sigmoid(0) = 0.5).
  l = tf.matrix_band_part(
      tf.sigmoid(tf.random_normal(shape=[x_dim, x_dim])), -1, 0)
  # Exploit Cholesky decomposition
  sigma = tf.matmul(l, tf.transpose(l))
  sigma *= 100.  # Small covariance causes extreme numerical instability
  sigma_inv = tf.matrix_inverse(sigma)

  def energy(x):
    """Unnormalized log density/energy of 2d strongly correlated Gaussian."""

    xmmu = x - mu
    return .5 * tf.diag_part(
        tf.matmul(tf.matmul(xmmu, sigma_inv), tf.transpose(xmmu)))

  return energy
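A hedged usage sketch, assuming TensorFlow 1.x: the returned closure maps a batch of points to one nonnegative energy each, with its minimum of 0 at mu.

import tensorflow as tf

energy = get_multivariate_gaussian_energy_fn(x_dim=2)
x = tf.random_normal([8, 2])    # batch of 8 samples
with tf.Session() as sess:
    print(sess.run(energy(x)))  # shape [8], all entries >= 0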
Example #18
def base_conditional(Kmn, Kmm, Knn, f, *, full_cov=False, q_sqrt=None, white=False):
    # compute kernel stuff
    num_func = tf.shape(f)[1]  # K
    Lm = tf.cholesky(Kmm)

    # Compute the projection matrix A
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = Knn - tf.matmul(A, A, transpose_a=True)
        shape = tf.stack([num_func, 1, 1])
    else:
        fvar = Knn - tf.reduce_sum(tf.square(A), 0)
        shape = tf.stack([num_func, 1])
    fvar = tf.tile(tf.expand_dims(fvar, 0), shape)  # K x N x N or K x N

    # another backsubstitution in the unwhitened case
    if not white:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # construct the conditional mean
    fmean = tf.matmul(A, f, transpose_a=True)

    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2)  # K x M x N
        elif q_sqrt.get_shape().ndims == 3:
            L = tf.matrix_band_part(q_sqrt, -1, 0)  # K x M x M
            A_tiled = tf.tile(tf.expand_dims(A, 0), tf.stack([num_func, 1, 1]))
            LTA = tf.matmul(L, A_tiled, transpose_a=True)  # K x M x N
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # K x N x N
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # K x N
    fvar = tf.transpose(fvar)  # N x K or N x N x K

    return fmean, fvar
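A hedged reading of base_conditional above, in the plain-text notation the surrounding GPflow snippets already use: with A = Lm^{-1} Kmn, it returns

    fmean = A^T f
    fvar  = Knn - A^T A + A^T S A

where S = q_sqrt q_sqrt^T is the covariance of q (the last term is dropped when q_sqrt is None); in the unwhitened case A is additionally back-substituted through Lm^T before the mean and the S-term, so those effectively use Kmm^{-1} Kmn.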
  def _operator_and_mat_and_feed_dict(self, shape, dtype, use_placeholder):
    # Upper triangle will be nonzero, but ignored.
    # Use a diagonal that ensures this matrix is well conditioned.
    tril = linear_operator_test_util.random_tril_matrix(
        shape, dtype=dtype, force_well_conditioned=True, remove_upper=False)

    if use_placeholder:
      tril_ph = tf.placeholder(dtype=dtype)
      # Evaluate the tril here because (i) you cannot feed a tensor, and (ii)
      # tril is random and we want the same value used for both mat and
      # feed_dict.
      tril = tril.eval()
      operator = linalg.LinearOperatorTriL(tril_ph)
      feed_dict = {tril_ph: tril}
    else:
      operator = linalg.LinearOperatorTriL(tril)
      feed_dict = None

    mat = tf.matrix_band_part(tril, -1, 0)

    return operator, mat, feed_dict
Example #20
def gauss_kl_white(q_mu, q_sqrt):
    """
    Compute the KL divergence from

          q(x) = N(q_mu, q_sqrt^2)
    to
          p(x) = N(0, I)

    We assume multiple independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt.

    q_mu is a matrix, each column contains a mean

    q_sqrt is a 3D tensor, each matrix within is a lower triangular square-root
        matrix of the covariance.
    """
    KL = 0.5 * tf.reduce_sum(tf.square(q_mu))  # Mahalanobis term
    KL += -0.5 * tf.cast(tf.reduce_prod(tf.shape(q_sqrt)[1:]), float_type)  # constant term
    L = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # force lower triangle
    KL -= 0.5 * tf.reduce_sum(tf.log(tf.square(tf.matrix_diag_part(L))))  # logdet
    KL += 0.5 * tf.reduce_sum(tf.square(L))  # Trace term.
    return KL
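For the whitened case above, the terms sum to the closed form KL(N(mu, S) || N(0, I)) = 0.5 * (||mu||^2 + tr(S) - d - log|S|) per independent distribution. A hedged NumPy check with illustrative names:

import numpy as np

rng = np.random.RandomState(1)
d = 3
mu = rng.randn(d, 1)
L = np.tril(rng.rand(d, d)) + np.eye(d)  # lower triangular sqrt of S
S = L @ L.T

kl = 0.5 * (np.sum(mu ** 2) + np.trace(S) - d - np.log(np.linalg.det(S)))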
Example #21
def mask_future(energies: tf.Tensor, mask_value=-1e9) -> tf.Tensor:
    """Mask energies of keys using lower triangular matrix.

    Mask simulates autoregressive decoding, such that it prevents
    the attention to look at what has not yet been decoded.
    Mask is not necessary during training when true output values
    are used instead of the decoded ones.

    Arguments:
        energies: A tensor to mask.
        mask_value: Value used to mask energies.

    Returns:
        Masked energies tensor.
    """
    triangular_mask = tf.matrix_band_part(tf.ones_like(energies), -1, 0)
    mask_area = tf.equal(triangular_mask, 1)

    # Note that for compatibility with tensor2tensor, we use -1e9 for negative
    # infinity.
    masked_value = tf.fill(tf.shape(energies), mask_value)
    return tf.where(mask_area, energies, masked_value)
 def _assertions(self, x):
   if not self.validate_args:
     return []
   shape = tf.shape(x)
   is_matrix = tf.assert_rank_at_least(
       x, 2, message="Input must have rank at least 2.")
   is_square = tf.assert_equal(
       shape[-2], shape[-1], message="Input must be a square matrix.")
   above_diagonal = tf.matrix_band_part(
       tf.matrix_set_diag(x, tf.zeros(shape[:-1], dtype=tf.float32)), 0, -1)
   is_lower_triangular = tf.assert_equal(
       above_diagonal,
       tf.zeros_like(above_diagonal),
       message="Input must be lower triangular.")
   # A lower triangular matrix is nonsingular iff all its diagonal entries are
   # nonzero.
   diag_part = tf.matrix_diag_part(x)
   is_nonsingular = tf.assert_none_equal(
       diag_part,
       tf.zeros_like(diag_part),
       message="Input must have all diagonal entries nonzero.")
   return [is_matrix, is_square, is_lower_triangular, is_nonsingular]
Example #23
    def _build_likelihood(self):
        """
        This method computes the variational lower bound on the likelihood,
        which is:

            E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F)]

        with

            q(\\mathbf f) = N(\\mathbf f \\,|\\, \\boldsymbol \\mu, \\boldsymbol \\Sigma)

        """

        # Get prior KL.
        KL = gauss_kl(self.q_mu, self.q_sqrt)

        # Get conditionals
        K = self.kern.K(self.X) + tf.eye(self.num_data, dtype=settings.float_type) * \
            settings.numerics.jitter_level
        L = tf.cholesky(K)

        fmean = tf.matmul(L, self.q_mu) + self.mean_function(self.X)  # NN,ND->ND

        q_sqrt_dnn = tf.matrix_band_part(self.q_sqrt, -1, 0)  # D x N x N

        L_tiled = tf.tile(tf.expand_dims(L, 0), tf.stack([self.num_latent, 1, 1]))

        LTA = tf.matmul(L_tiled, q_sqrt_dnn)  # D x N x N
        fvar = tf.reduce_sum(tf.square(LTA), 2)

        fvar = tf.transpose(fvar)

        # Get variational expectations.
        var_exp = self.likelihood.variational_expectations(fmean, fvar, self.Y)

        return tf.reduce_sum(var_exp) - KL
  def testDefaultsYieldCorrectShapesAndValues(self):
    batch_shape = [4, 3]
    x_size = 3
    mvn_size = 5
    x_ = np.random.randn(*np.concatenate([batch_shape, [x_size]]))

    x = tf.constant(x_)
    mvn = tfp.trainable_distributions.multivariate_normal_tril(x, dims=mvn_size)
    scale = mvn.scale.to_dense()
    scale_upper = tf.matrix_set_diag(
        tf.matrix_band_part(scale, num_lower=0, num_upper=-1),
        tf.zeros(np.concatenate([batch_shape, [mvn_size]]), scale.dtype))
    scale_diag = tf.matrix_diag_part(scale)

    self.evaluate(tf.global_variables_initializer())
    [
        batch_shape_,
        event_shape_,
        scale_diag_,
        scale_upper_,
    ] = self.evaluate([
        mvn.batch_shape_tensor(),
        mvn.event_shape_tensor(),
        scale_diag,
        scale_upper,
    ])

    self.assertAllEqual(batch_shape, mvn.batch_shape)
    self.assertAllEqual(batch_shape, batch_shape_)

    self.assertAllEqual([mvn_size], mvn.event_shape)
    self.assertAllEqual([mvn_size], event_shape_)

    self.assertAllEqual(np.ones_like(scale_diag_, dtype=np.bool),
                        scale_diag_ > 0.)
    self.assertAllEqual(np.zeros_like(scale_upper_), scale_upper_)
Example #25
    def __init__(self, placeholders, input_dim, attack=None, **kwargs):
        super(GCN, self).__init__(**kwargs)
        print('attack method:', attack)
        # if attack is False, placeholders['support'] feeds in the normalized, pre-processed adjacency matrix;
        # if attack is True, placeholders['adj'] feeds in the raw adjacency matrix and placeholders['s'] feeds in the attack placeholders
        self.inputs = placeholders['features']
        self.input_dim = input_dim
        # self.input_dim = self.inputs.get_shape().as_list()[1]  # To be supported in future Tensorflow versions
        self.output_dim = placeholders['labels'].get_shape().as_list()[1]
        self.placeholders = placeholders
        lmd = placeholders['lmd']
        self.attack = attack 

        if self.attack:
            mu = placeholders['mu']
            
            # the length of the A list; in fact, self.num_supports is always 1
            self.num_supports = len(placeholders['adj'])
            # original adjacency matrix A
            self.A = placeholders['adj']
            self.mask = [tf.constant(np.triu(np.ones([self.A[0].get_shape()[0].value]*2, dtype = np.float32),1))]
             
            self.C = [1 - 2 * self.A[i] - tf.eye(self.A[i].get_shape().as_list()[0], self.A[i].get_shape().as_list()[1]) for i in range(self.num_supports)] 
            # placeholder for adding edges
            self.upper_S_0 = placeholders['s'] 
            # a strictly upper triangular matrix ensures only N(N-1)/2 trainable variables;
            # matrix_band_part is used here to enforce a strictly upper triangular matrix
            self.upper_S_real = [tf.matrix_band_part(self.upper_S_0[i],0,-1)-tf.matrix_band_part(self.upper_S_0[i],0,0) for i in range(self.num_supports)] 
            # modified_A is the new adjacency matrix
            self.upper_S_real2 = [self.upper_S_real[i] + tf.transpose(self.upper_S_real[i]) for i in range(self.num_supports)]
            self.modified_A = [self.A[i] + tf.multiply(self.upper_S_real2[i], self.C[i]) for i in range(self.num_supports)]
            """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation."""   
            self.hat_A = [tf.cast(self.modified_A[i] + tf.eye(self.modified_A[i].get_shape().as_list()[0], self.modified_A[i].get_shape().as_list()[1]),dtype='float32') for i in range(self.num_supports)] 
            
            # get degree by row sum
            self.rowsum = tf.reduce_sum(self.hat_A[0],axis=1) 
            self.d_sqrt = tf.sqrt(self.rowsum) # square root
            self.d_sqrt_inv = tf.math.reciprocal(self.d_sqrt) # reciprocal
            
            self.support_real = tf.multiply(tf.transpose(tf.multiply(self.hat_A[0],self.d_sqrt_inv)),self.d_sqrt_inv)
            # this self.support is a list of \tilde{A} in the paper
            # replace the 'support' in the placeholders dictionary
            self.placeholders['support'] = [self.support_real] 
            
            self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)
            self.build()
            
            
            
            # proximal gradient algorithm
            if self.attack == 'PGD':
                self.Sgrad = tf.gradients(self.attack_loss, self.upper_S_real[0])
                self.a = self.upper_S_real[0] + mu * self.Sgrad * lmd * self.mask
            elif self.attack == 'CW':
                label = placeholders['labels'] 
                real = tf.reduce_sum(label * self.outputs,1)
                label_mask_expand = placeholders['label_mask_expand']
                other = tf.reduce_max((1 - label) * label_mask_expand * self.outputs - label * 10000,1)
                self.loss1 = tf.maximum(0.0, (real-other+50)*label_mask_expand[:,0])
                self.loss2 = tf.reduce_sum(self.loss1) 
                self.Sgrad = tf.gradients(self.loss2, self.upper_S_real[0])
                self.a = self.upper_S_real[0] - mu * self.Sgrad * lmd * self.mask
            elif self.attack == 'minmax':
                self.w = placeholders['w']
                label = placeholders['labels'] 
                self.real = tf.reduce_sum(label * self.outputs,1)
                label_mask_expand = placeholders['label_mask_expand']
                self.other = tf.reduce_max((1 - label) * label_mask_expand * self.outputs - label * 10000,1)
                self.loss1 = self.w * tf.maximum(0.0, self.real-self.other+0.)
                self.loss2 = tf.reduce_sum(self.loss1) 
                self.Sgrad = tf.gradients(self.loss2, self.upper_S_real[0])
                self.a = self.upper_S_real[0] - mu * self.Sgrad * self.mask
            else:
                raise NotImplementedError
            
            
        else:
            self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)
            self.build()
Example #26
    def build_model(self):
        # add place holder
        self.contexts = tf.placeholder(shape=[None, None],
                                       dtype=tf.int32,
                                       name="context")
        self.context_legnths = tf.placeholder(shape=[None],
                                              dtype=tf.int32,
                                              name="c_length")
        self.questions = tf.placeholder(shape=[None, None],
                                        dtype=tf.int32,
                                        name="q")
        self.question_legnths = tf.placeholder(shape=[None],
                                               dtype=tf.int32,
                                               name="q_len")
        # [batch, num_sentences, num_words]
        self.sentences = tf.placeholder(shape=[None, None, None],
                                        dtype=tf.int32,
                                        name="sentences")
        # [num_sentences, num_words]
        self.sequence_lengths = tf.placeholder(shape=[None, None],
                                               dtype=tf.int32,
                                               name="seq_len")
        # [num_sentences]
        self.sentence_lengths = tf.placeholder(shape=[None],
                                               dtype=tf.int32,
                                               name="sent_len")
        self.sentence_idx = tf.placeholder(shape=[None],
                                           dtype=tf.int32,
                                           name="sent_idx")
        self.answerable = tf.placeholder(shape=[None],
                                         dtype=tf.int32,
                                         name="answ")
        self.answer_span = tf.placeholder(shape=[None, 2],
                                          dtype=tf.int32,
                                          name="answer_span")
        self.dropout = tf.placeholder(dtype=tf.float32, name="dropout")

        self.avg_loss = tf.placeholder(dtype=tf.float32, name="avg_loss")
        self.avg_em = tf.placeholder(dtype=tf.float32, name="avg_em")
        self.avg_acc = tf.placeholder(dtype=tf.float32, name="avg_acc")
        loss_summary = tf.summary.scalar("loss", self.avg_loss)
        acc_summary = tf.summary.scalar("accuracy", self.avg_acc)
        em_summary = tf.summary.scalar("em", self.avg_em)
        self.merged = tf.summary.merge([loss_summary, acc_summary, em_summary])

        self.document_size, self.sentence_size, self.word_size = tf.unstack(
            tf.shape(self.sentences))
        # add embeddings
        zeros = tf.constant([[0.0] * self.config.embedding_size])
        unk_dummy = tf.get_variable(shape=[2, self.config.embedding_size],
                                    initializer=layers.xavier_initializer(),
                                    name="special_token")
        # load pre-trained GloVe
        embedding_matrix = tf.Variable(initial_value=self.config.embeddings,
                                       trainable=False,
                                       dtype=tf.float32,
                                       name="embedding")
        self.embedding_matrix = tf.concat([zeros, unk_dummy, embedding_matrix],
                                          axis=0)
        self.embedded_sentences = tf.nn.embedding_lookup(
            self.embedding_matrix, self.sentences)
        self.embedded_sentences = tf.layers.dropout(self.embedded_sentences,
                                                    self.dropout)
        self.embedded_context = tf.nn.embedding_lookup(self.embedding_matrix,
                                                       self.contexts)
        self.embedded_context = tf.layers.dropout(self.embedded_context,
                                                  self.dropout)
        self.embedded_questions = tf.nn.embedding_lookup(
            self.embedding_matrix, self.questions)
        self.embedded_questions = tf.layers.dropout(self.embedded_questions,
                                                    self.dropout)
        # conv block and self attention block
        with tf.variable_scope("Embedding_Encoder_Layer"):
            contexts = self.residual_block(self.embedded_context,
                                           self.context_legnths,
                                           num_blocks=1,
                                           num_conv_blocks=4,
                                           kernel_size=7,
                                           num_filters=128,
                                           scope="Embedding_Encoder",
                                           reuse=False)
            questions = self.residual_block(self.embedded_questions,
                                            self.question_legnths,
                                            num_blocks=1,
                                            num_conv_blocks=4,
                                            kernel_size=7,
                                            num_filters=128,
                                            scope="Embedding_Encoder",
                                            reuse=True)
            reshaped_sentences = tf.reshape(
                self.embedded_sentences,
                [-1, self.word_size, self.config.embedding_size])
            sentence_len = tf.reshape(self.sequence_lengths, [-1])
            encoded_sentence = self.residual_block(reshaped_sentences,
                                                   sentence_len,
                                                   num_blocks=1,
                                                   num_conv_blocks=1,
                                                   kernel_size=7,
                                                   num_filters=128,
                                                   scope="Embedding_Encoder",
                                                   reuse=True)

        with tf.variable_scope("hierarchical_attention") and tf.device(
                "/device:GPU:0"):
            # [b * s, w, d]
            cnn_inputs = tf.layers.dense(
                encoded_sentence,
                self.config.filter_size,
                kernel_regularizer=self.regularizer,
                kernel_initializer=layers.xavier_initializer(),
                activation=tf.nn.relu)
            sentence_cnn = self.conv_encoder(cnn_inputs,
                                             self.config.filter_size,
                                             scope="word_encoder",
                                             reuse=False)
            encoded_question = self.question_encoding(questions,
                                                      self.question_legnths)
            # [b, s, d]
            sentence_vectors = self.word_level_attention(
                encoded_question, sentence_cnn, self.document_size,
                self.sentence_size, self.word_size, self.sequence_lengths)
            sentence_cnn = self.conv_encoder(sentence_vectors,
                                             self.config.filter_size,
                                             scope="sentence_encoder",
                                             reuse=False)
            document_vector, sentence_score = self.sentence_level_attention(
                encoded_question, sentence_cnn, self.sentence_size,
                self.sentence_lengths)

            self.attention_loss, self.binary_loss = self.auxiliary_loss(
                sentence_score, document_vector, encoded_question)
        with tf.variable_scope("Context_Query_Attention_Layer") and tf.device(
                "/device:GPU:0"):
            A, B = self.co_attention(questions, contexts,
                                     self.question_legnths,
                                     self.context_legnths)
            attention_outputs = [contexts, A, contexts * A, contexts * B]
        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=2)
            inputs = tf.layers.dense(
                inputs,
                self.config.attention_size,
                kernel_regularizer=self.regularizer,
                kernel_initializer=layers.variance_scaling_initializer(),
                activation=tf.nn.relu)
            memories = []
            for i in range(3):
                outputs = self.residual_block(inputs,
                                              self.context_legnths,
                                              num_blocks=7,
                                              num_conv_blocks=2,
                                              num_filters=128,
                                              kernel_size=5,
                                              scope="Model_Encoder",
                                              reuse=True if i > 0 else False)
                if i == 2:
                    outputs = tf.layers.dropout(outputs, self.dropout)
                memories.append(outputs)
                inputs = outputs

        with tf.variable_scope("Output_Layer") and tf.device("/device:GPU:0"):
            logits_inputs = tf.concat([memories[0], memories[1]], axis=2)
            start_logits = self.pointer_network(document_vector,
                                                logits_inputs,
                                                self.context_legnths,
                                                scope="start_logits")
            logits_inputs = tf.concat([memories[0], memories[2]], axis=2)
            end_logits = self.pointer_network(document_vector,
                                              logits_inputs,
                                              self.context_legnths,
                                              scope="end_logits")

            start_label, end_label = tf.split(self.answer_span, 2, axis=1)
            start_label = tf.squeeze(start_label, axis=-1)
            end_label = tf.squeeze(end_label, axis=-1)
            losses1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=start_logits, labels=start_label)
            losses2 = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=end_logits, labels=end_label)
            cross_entropy_loss = tf.reduce_mean(losses1 + losses2)
            self.loss = cross_entropy_loss \
                        + self.config.alpha * self.attention_loss \
                        + self.config.beta * self.binary_loss

        # for inference
        logits1 = tf.nn.softmax(start_logits)
        logits2 = tf.nn.softmax(end_logits)
        outer_product = tf.matmul(tf.expand_dims(logits1, axis=2),
                                  tf.expand_dims(logits2, axis=1))
        outer = tf.matrix_band_part(outer_product, 0, self.config.ans_limit)
        self.start = tf.argmax(tf.reduce_max(outer, axis=2),
                               axis=1,
                               output_type=tf.int32)
        self.end = tf.argmax(tf.reduce_max(outer, axis=1),
                             axis=1,
                             output_type=tf.int32)
        self.em = self.evaluate_em(self.start, self.end, self.answer_span,
                                   self.unans_prob)
        if self.config.l2_lambda > 0:
            vars = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = layers.apply_regularization(self.regularizer, vars)
            self.loss += l2_loss
        # Exponential moving average
        self.var_ema = tf.train.ExponentialMovingAverage(0.9999)
        ema_op = self.var_ema.apply(tf.trainable_variables())
        with tf.control_dependencies([ema_op]):
            self.loss = tf.identity(self.loss)

            self.assign_vars = []
            for var in tf.global_variables():
                v = self.var_ema.average(var)
                if v:
                    self.assign_vars.append(tf.assign(var, v))

        self.add_train_op()
        self.init_session()
    def call(self, inputs, mask=None, **kwargs):
        if isinstance(inputs, list):
            inputs, positions = inputs
            positions = K.cast(positions, 'int32')
            mask = mask[1]
        else:
            positions = None

        input_len = K.shape(inputs)[1]

        if self.attention_type == SeqSelfAttention.ATTENTION_TYPE_ADD:
            e = self._call_additive_emission(inputs)
        elif self.attention_type == SeqSelfAttention.ATTENTION_TYPE_MUL:
            e = self._call_multiplicative_emission(inputs)

        if self.attention_activation is not None:
            e = self.attention_activation(e)
        e = K.exp(e - K.max(e, axis=-1, keepdims=True))
        if self.attention_width is not None:
            ones = tf.ones((input_len, input_len))
            if self.history_only:
                local = tf.matrix_band_part(
                    ones,
                    K.minimum(input_len, self.attention_width - 1),
                    0,
                )
            else:
                local = tf.matrix_band_part(
                    ones,
                    K.minimum(input_len, self.attention_width // 2),
                    K.minimum(input_len, (self.attention_width - 1) // 2),
                )
            e = e * K.expand_dims(local, 0)
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask)
            e = K.permute_dimensions(
                K.permute_dimensions(e * mask, (0, 2, 1)) * mask, (0, 2, 1))

        # a_{t} = \text{softmax}(e_t)
        s = K.sum(e, axis=-1)
        s = K.tile(K.expand_dims(s, axis=-1), K.stack([1, 1, input_len]))
        a = e / (s + K.epsilon())

        # l_t = \sum_{t'} a_{t, t'} x_{t'}
        v = K.batch_dot(a, inputs)
        if self.attention_regularizer_weight > 0.0:
            self.add_loss(self._attention_regularizer(a))

        if positions is not None:
            pos_num = K.shape(positions)[1]
            batch_indices = K.tile(
                K.expand_dims(K.arange(K.shape(inputs)[0]), axis=-1),
                K.stack([1, pos_num]))
            pos_indices = K.stack([batch_indices, positions], axis=-1)
            v = tf.gather_nd(v, pos_indices)
            a = tf.gather_nd(a, pos_indices)

        if self.return_attention:
            return [v, a]
        return v
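A hedged sketch of the local-window trick used above, assuming TensorFlow 1.x: band (lower, upper) of an all-ones matrix is 1 exactly where -lower <= j - i <= upper, so position i may attend to positions j in [i - lower, i + upper].

import tensorflow as tf

ones = tf.ones((5, 5))
local = tf.matrix_band_part(ones, 2, 1)  # each i attends to [i - 2, i + 1]
with tf.Session() as sess:
    print(sess.run(local))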
Example #28
def concordance_index4(y_true, y_pred):
  y_true_1 = tf.expand_dims(y_true, 0)
  y_true_2 = tf.expand_dims(y_true, 1)
  y_pred_1 = tf.expand_dims(y_pred, 0)
  y_pred_2 = tf.expand_dims(y_pred, 1)
  y_true_diff = tf.sign(tf.subtract(y_true_1, y_true_2))
  y_true_diff = tf.matrix_band_part(y_true_diff, 0, -1)
  y_pred_diff = tf.sign(tf.subtract(y_pred_1, y_pred_2))
  y_pred_diff = tf.matrix_band_part(y_pred_diff, 0, -1)
  ones = tf.ones_like(y_pred_diff)
  mask_a = tf.matrix_band_part(ones, 0, -1)
  mask_b = tf.matrix_band_part(ones, 0, 0)
  mask = tf.cast(mask_a - mask_b, dtype=tf.bool)
  sess = tf.Session()
  mask = sess.run(mask)
  y_pred_diff = sess.run(y_pred_diff)
  y_true_diff = sess.run(y_true_diff)
  CPU_COUNT = int(0.5*os.cpu_count())

  with Pool(processes=CPU_COUNT) as pool:
    time_start = time.time()
    procs_pred = [pool.apply_async(taking, [maski, y_pred_diff_i])
                  for (maski, y_pred_diff_i) in zip(mask, y_pred_diff)]
    results_pred = [proc.get() for proc in procs_pred]
    time_start1 = time.time()
    y_pred_diff_flat = np.array(list(itertools.chain.from_iterable(results_pred)))
    time_end1 = time.time()
    procs_true = [pool.apply_async(taking, [maski, y_true_diff_i])
                  for (maski, y_true_diff_i) in zip(mask, y_true_diff)]
    results_true = [proc.get() for proc in procs_true]
    time_start2 = time.time()
    y_true_diff_flat = np.array(list(itertools.chain.from_iterable(results_true)))
    time_end2 = time.time()
    print("time used in flattening arrays: ", time_end2 + time_end1 - time_start2 - time_start1)
    print("time used in CPU: ", time_end2 - time_start)

  valid_pairs = tf.not_equal(y_true_diff_flat, 0.0)
  valid_pairs = tf.cast(valid_pairs, dtype=tf.float64)
    
  raw_comparison = tf.divide(tf.add(tf.multiply(y_true_diff_flat, y_pred_diff_flat), 1), 2)
  scores = tf.multiply(raw_comparison, valid_pairs)
  quotient = tf.reduce_sum(scores)/tf.reduce_sum(valid_pairs) 
  
  quotient = sess.run(quotient)
  return quotient
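The mask built inside concordance_index4 is the strictly upper triangle: band (0, -1) keeps the upper triangle including the diagonal, and subtracting band (0, 0) removes the diagonal, so each unordered pair of examples is counted exactly once. A hedged standalone sketch, assuming TensorFlow 1.x:

import tensorflow as tf

ones = tf.ones((4, 4))
strict_upper = tf.matrix_band_part(ones, 0, -1) - tf.matrix_band_part(ones, 0, 0)
# strict_upper is 1.0 exactly where j > i, and 0.0 elsewhere.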
Example #29
def model_fn(features,
             labels,
             mode,
             params,
             word_embeddings_np=None,
             char_embeddings_np=None):
    attention_fun = partial(BahdanauAttention, num_units=params.units) if params.attention == 'bahdanau' \
        else partial(LuongAttention, num_units=2 * params.units)

    dropout = params.dropout if mode == tf.estimator.ModeKeys.TRAIN else 0.0
    passage_count = params.passage_count if mode != tf.estimator.ModeKeys.TRAIN \
        else params.train_passage_count

    question_words_length = features['question_length']
    passage_words_length = features['passage_length']

    devices = get_devices()

    with tf.device('/cpu:0'):
        word_embeddings_placeholder = tf.placeholder(
            shape=[params.vocab_size, params.emb_size], dtype=tf.float32)
        char_embeddings_placeholder = tf.placeholder(
            shape=[params.char_vocab_size, params.char_emb_size],
            dtype=tf.float32)

        # word_embeddings = tf.create_partitioned_variables(shape=[params.vocab_size, params.emb_size],
        #                                                   slicing=[10, 1],
        #                                                   initializer=word_embeddings_placeholder,
        #                                                   trainable=False, name="word_embeddings")
        word_embeddings = tf.Variable(word_embeddings_placeholder,
                                      trainable=False,
                                      name="word_embeddings")
        char_embeddings = tf.Variable(char_embeddings_placeholder,
                                      trainable=False,
                                      name="char_embeddings")

        word_embeddings = tf.nn.dropout(word_embeddings,
                                        1.0 - dropout,
                                        noise_shape=[params.vocab_size, 1])
        char_embeddings = tf.nn.dropout(
            char_embeddings,
            1.0 - dropout,
            noise_shape=[params.char_vocab_size, 1])

    question_words_emb = tf.nn.embedding_lookup(word_embeddings,
                                                features['question_words'])
    question_chars_emb = tf.nn.embedding_lookup(char_embeddings,
                                                features['question_chars'])

    passage_words_emb = tf.nn.embedding_lookup(word_embeddings,
                                               features['passage_words'])
    passage_chars_emb = tf.nn.embedding_lookup(char_embeddings,
                                               features['passage_chars'])

    with tf.device(next(devices)):
        with tf.variable_scope('question_encoding'):
            question_enc = encoder(question_words_emb,
                                   question_words_length,
                                   question_chars_emb,
                                   features['question_char_length'],
                                   params,
                                   dropout=dropout)

    with tf.device(next(devices)):
        with tf.variable_scope('passage_encoding'):
            passage_enc = encoder(passage_words_emb,
                                  passage_words_length,
                                  passage_chars_emb,
                                  features['passage_char_length'],
                                  params,
                                  dropout=dropout)
        # question_enc = tf.Print(question_enc, [question_enc], summarize=1000)

        with tf.variable_scope('attention'):
            attention = attention_fun(
                memory=question_enc,
                memory_sequence_length=question_words_length)
            cell_fw = GatedAttentionWrapper(
                attention,
                DropoutWrapper(
                    GRUCell(params.units, name="attention_gru"),
                    # output_keep_prob=1.0 - dropout,
                    input_keep_prob=1.0 - dropout,
                    # state_keep_prob=1.0 - dropout,
                    variational_recurrent=True,
                    input_size=4 * params.units,
                    dtype=tf.float32),
                dropout=0)

            cell_bw = GatedAttentionWrapper(
                attention,
                DropoutWrapper(
                    GRUCell(params.units, name="attention_gru"),
                    # output_keep_prob=1.0 - dropout,
                    input_keep_prob=1.0 - dropout,
                    # state_keep_prob=1.0 - dropout
                    variational_recurrent=True,
                    input_size=4 * params.units,
                    dtype=tf.float32),
                dropout=0)

            passage_repr, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw,
                cell_bw,
                passage_enc,
                passage_words_length,
                dtype=tf.float32)
            passage_repr = tf.concat(passage_repr, -1)

        with tf.variable_scope('pointer'):
            question_att = attention_fun(
                memory=question_enc,
                memory_sequence_length=question_words_length,
                name="question_align")

            pool_param = tf.get_variable('pool_param',
                                         shape=(question_att._num_units, ),
                                         initializer=tf.initializers.ones)
            pool_param = tf.reshape(
                tf.tile(pool_param, [tf.shape(question_enc)[0]]),
                (-1, question_att._num_units))

            question_alignments, _ = question_att(pool_param, None)
            question_pool = tf.reduce_sum(
                tf.expand_dims(question_alignments, -1) * question_enc, 1)

            logits1, logits2 = pointer_net(passage_repr,
                                           passage_words_length,
                                           question_pool,
                                           params,
                                           attention_fun=attention_fun,
                                           dropout=dropout)

        outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                          tf.expand_dims(tf.nn.softmax(logits2), axis=1))
        outer = tf.matrix_band_part(outer, 0, 15)
        p1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
        p2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            predictions = {'start': p1, 'end': p2}
            export_outputs = {
                'prediction': tf.estimator.export.PredictOutput(predictions)
            }

            return tf.estimator.EstimatorSpec(mode,
                                              predictions=predictions,
                                              export_outputs=export_outputs)

        with tf.variable_scope('passage_ranking'):
            W_g = Dense(params.units, activation=tf.tanh, use_bias=False)
            v_g = Dense(1, use_bias=False)

            memory_layer = Dense(params.units,
                                 name="memory_layer",
                                 use_bias=False,
                                 dtype=tf.float32)
            query_layer = Dense(params.units,
                                name="query_layer",
                                use_bias=False,
                                dtype=tf.float32)
            g = []

            for i in range(passage_count):
                passage_mask = tf.boolean_mask(
                    passage_repr, tf.equal(features['partitions'], i))
                passage_i = tf.split(passage_mask,
                                     features['partitions_len'][:, i])
                passage_i = [
                    pad_to_shape_2d(
                        p, (tf.Dimension(params.passage_max_len), p.shape[1]))
                    for p in passage_i
                ]
                passage_i = tf.stack(passage_i)

                passage_alignment, _ = ReusableBahdanauAttention(
                    params.units,
                    passage_i,
                    features['partitions_len'][:, i],
                    memory_layer=memory_layer,
                    query_layer=query_layer,
                    name="passage_align")(question_pool, None)

                passage_pool = tf.reduce_sum(
                    tf.expand_dims(passage_alignment, -1) * passage_i, 1)
                g_i = v_g(W_g(tf.concat([question_pool, passage_pool], -1)))

                # g_i = tf.Print(g_i, [passage_mask, passage_i], message='is_nan_{}'.format(i), summarize=1000)
                g.append(g_i)

            g = tf.concat(g, -1)

    answer_start, answer_end, passage_rank = labels

    loss1 = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits1, labels=tf.stop_gradient(answer_start))
    loss2 = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits2, labels=tf.stop_gradient(answer_end))

    loss3 = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=g, labels=tf.stop_gradient(passage_rank))

    # loss1 = tf.Print(loss1, [tf.argmax(answer_start, -1), tf.argmax(answer_end, -1),
    #                          tf.reduce_mean(loss1), tf.reduce_mean(loss2), tf.reduce_mean(loss3)], message="loss")

    loss = (params.r * tf.reduce_mean(loss1 + loss2) + (1 - params.r) * tf.reduce_mean(loss3)) \
        if params.r < 1 else tf.reduce_mean(loss1 + loss2)

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdadeltaOptimizer(
            learning_rate=params.learning_rate, epsilon=1e-6)
        global_step = tf.train.get_or_create_global_step()

        grads = optimizer.compute_gradients(loss)
        gradients, variables = zip(*grads)
        capped_grads, _ = tf.clip_by_global_norm(gradients, params.grad_clip)
        train_op = optimizer.apply_gradients(zip(capped_grads, variables),
                                             global_step=global_step)

        return EstimatorSpec(
            mode,
            loss=loss,
            train_op=train_op,
            scaffold=tf.train.Scaffold(
                init_feed_dict={
                    word_embeddings_placeholder: word_embeddings_np,
                    char_embeddings_placeholder: char_embeddings_np
                }),
        )

    if mode == tf.estimator.ModeKeys.EVAL:
        table = lookup_ops.index_to_string_table_from_file(
            params.word_vocab_file, value_column_index=0, delimiter=" ")
        return EstimatorSpec(mode,
                             loss=loss,
                             eval_metric_ops={
                                 'rouge-l':
                                 extraction_metric(p1, p2,
                                                   tf.argmax(answer_start, -1),
                                                   tf.argmax(answer_end, -1),
                                                   features['passage_words'],
                                                   params, table),
                                 'f1':
                                 extraction_metric(p1,
                                                   p2,
                                                   tf.argmax(answer_start, -1),
                                                   tf.argmax(answer_end, -1),
                                                   features['passage_words'],
                                                   params,
                                                   table,
                                                   metric='f1')
                             })
Example #30
 def getLowerDiag(inputs):
     inputs_matrix = tf.reshape(tf.tile(inputs, [tf.shape(inputs)[0]]),
                                [-1, tf.shape(inputs)[0]])
     result = tf.matrix_band_part(inputs_matrix, -1, 0)
     return result
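A hedged illustration of getLowerDiag, assuming TensorFlow 1.x: tiling a length-n vector into an n x n matrix and keeping the lower band means row i carries the prefix inputs[:i + 1].

import tensorflow as tf

with tf.Session() as sess:
    print(sess.run(getLowerDiag(tf.constant([1., 2., 3.]))))
    # [[1. 0. 0.]
    #  [1. 2. 0.]
    #  [1. 2. 3.]]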
Example #31
    def ready(self):
        config = self.config
        N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden
        gru = cudnn_gru if config.use_cudnn else native_gru

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                ch_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.ch),
                    [N * PL, CL, dc])
                qh_emb = tf.reshape(
                    tf.nn.embedding_lookup(self.char_mat, self.qh),
                    [N * QL, CL, dc])
                ch_emb = dropout(ch_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                qh_emb = dropout(qh_emb,
                                 keep_prob=config.keep_prob,
                                 is_train=self.is_train)
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

        with tf.variable_scope("encoding"):
            rnn = gru(num_layers=3,
                      num_units=d,
                      batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c,
                                   q,
                                   mask=self.q_mask,
                                   hidden=d,
                                   keep_prob=config.keep_prob,
                                   is_train=self.is_train)
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            self_att = dot_attention(att,
                                     att,
                                     mask=self.c_mask,
                                     hidden=d,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
            rnn = gru(num_layers=1,
                      num_units=d,
                      batch_size=N,
                      input_size=self_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob,
                      is_train=self.is_train)
            match = rnn(self_att, seq_len=self.c_len)

        with tf.variable_scope("pointer"):
            init = summ(q[:, :, -2 * d:],
                        d,
                        mask=self.q_mask,
                        keep_prob=config.ptr_keep_prob,
                        is_train=self.is_train)
            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              keep_prob=config.ptr_keep_prob,
                              is_train=self.is_train)
            logits1, logits2 = pointer(init, match, d, self.c_mask)

        with tf.variable_scope("predict"):
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)
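The predict block above is a standard span-extraction trick: outer[i, j] is the joint probability of start i and end j, and tf.matrix_band_part(outer, 0, 15) zeroes every span with j < i or j - i > 15 before the argmax. A minimal sketch of the same trick on hypothetical 1 x 3 logits:

import tensorflow as tf

p_start = tf.nn.softmax(tf.constant([[0.1, 2.0, 0.3]]))  # 1 x 3
p_end = tf.nn.softmax(tf.constant([[0.2, 0.1, 3.0]]))    # 1 x 3
outer = tf.matmul(tf.expand_dims(p_start, axis=2),
                  tf.expand_dims(p_end, axis=1))          # 1 x 3 x 3
outer = tf.matrix_band_part(outer, 0, 15)  # keep only spans with start <= end
best_start = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
best_end = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)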
Example #32
 def _random_tril_matrix(self, shape):
     mat = self.rng.rand(*shape)
     chol = tfd.matrix_diag_transform(mat, transform=tf.nn.softplus)
     return tf.matrix_band_part(chol, -1, 0)
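A sketch of how such a factor is typically consumed, written with core TF ops only (the 3 x 3 shape and the fixed seed are arbitrary assumptions):

import numpy as np
import tensorflow as tf

rng = np.random.RandomState(0)
mat = tf.constant(rng.rand(3, 3), dtype=tf.float32)
chol = tf.matrix_set_diag(
    tf.matrix_band_part(mat, -1, 0),            # keep the lower triangle
    tf.nn.softplus(tf.matrix_diag_part(mat)))   # force a positive diagonal
pd = tf.matmul(chol, chol, transpose_b=True)    # positive definite by construction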
Example #33
def uncertain_conditional(Xnew_mu, Xnew_var, feat, kern, q_mu, q_sqrt, *,
                          mean_function=None, full_output_cov=False, full_cov=False, white=False):
    """
    Calculates the conditional for uncertain inputs Xnew, p(Xnew) = N(Xnew_mu, Xnew_var).
    See ``conditional`` documentation for further reference.

    :param Xnew_mu: mean of the inputs, size N x Din
    :param Xnew_var: covariance matrix of the inputs, size N x Din x Din
    :param feat: gpflow.InducingFeature object, only InducingPoints is supported
    :param kern: gpflow kernel or ekernel object.
    :param q_mu: mean inducing points, size M x Dout
    :param q_sqrt: cholesky of the covariance matrix of the inducing points, size Dout x M x M
    :param full_output_cov: boolean whether to compute the covariance between output dimensions.
                            Influences the shape of the return value ``fvar``. Default is False.
    :param white: boolean whether to use the whitened representation. Default is False.

    :return fmean, fvar: mean and covariance of the conditional; ``fmean`` has size N x Dout,
            and the size of ``fvar`` depends on ``full_output_cov``: N x Dout x Dout if True,
            N x Dout if False
    """

    # TODO(VD): Tensorflow 1.7 doesn't support broadcasting in``tf.matmul`` and
    # ``tf.matrix_triangular_solve``. This is reported in issue 216.
    # As a temporary workaround, we are using ``tf.einsum`` for the matrix
    # multiplications and tiling in the triangular solves.
    # The code that should be used once the bug is resolved is added in comments.

    if not isinstance(feat, InducingPoints):
        raise NotImplementedError

    if full_cov:
        # TODO(VD): ``full_cov`` True would return a ``fvar`` of shape N x N x D x D,
        # encoding the covariance between input datapoints as well.
        # This is not implemented as this feature is only used for plotting purposes.
        raise NotImplementedError

    pXnew = Gaussian(Xnew_mu, Xnew_var)

    num_data = tf.shape(Xnew_mu)[0]  # number of new inputs (N)
    num_ind = tf.shape(q_mu)[0]  # number of inducing points (M)
    num_func = tf.shape(q_mu)[1]  # output dimension (D)

    q_sqrt_r = tf.matrix_band_part(q_sqrt, -1, 0)  # D x M x M

    eKuf = tf.transpose(expectation(pXnew, (kern, feat)))  # M x N (psi1)
    Kuu = feat.Kuu(kern, jitter=settings.numerics.jitter_level)  # M x M
    Luu = tf.cholesky(Kuu)  # M x M

    if not white:
        q_mu = tf.matrix_triangular_solve(Luu, q_mu, lower=True)
        Luu_tiled = tf.tile(Luu[None, :, :], [num_func, 1, 1])  # remove line once issue 216 is fixed
        q_sqrt_r = tf.matrix_triangular_solve(Luu_tiled, q_sqrt_r, lower=True)

    Li_eKuf = tf.matrix_triangular_solve(Luu, eKuf, lower=True)  # M x N
    fmean = tf.matmul(Li_eKuf, q_mu, transpose_a=True)

    eKff = expectation(pXnew, kern)  # N (psi0)
    eKuffu = expectation(pXnew, (kern, feat), (kern, feat))  # N x M x M (psi2)
    Luu_tiled = tf.tile(Luu[None, :, :], [num_data, 1, 1])  # remove this line, once issue 216 is fixed
    Li_eKuffu = tf.matrix_triangular_solve(Luu_tiled, eKuffu, lower=True)
    Li_eKuffu_Lit = tf.matrix_triangular_solve(Luu_tiled, tf.matrix_transpose(Li_eKuffu), lower=True)  # N x M x M
    cov = tf.matmul(q_sqrt_r, q_sqrt_r, transpose_b=True)  # D x M x M

    if mean_function is None or isinstance(mean_function, mean_functions.Zero):
        e_related_to_mean = tf.zeros((num_data, num_func, num_func), dtype=settings.float_type)
    else:
        # Update mean: \mu(x) + m(x)
        fmean = fmean + expectation(pXnew, mean_function)

        # Calculate: m(x) m(x)^T + m(x) \mu(x)^T + \mu(x) m(x)^T,
        # where m(x) is the mean_function and \mu(x) is fmean
        e_mean_mean = expectation(pXnew, mean_function, mean_function)  # N x D x D
        Lit_q_mu = tf.matrix_triangular_solve(Luu, q_mu, adjoint=True)
        e_mean_Kuf = expectation(pXnew, mean_function, (kern, feat))  # N x D x M
        # einsum isn't able to infer the rank of e_mean_Kuf, hence we explicitly set the rank of the tensor:
        e_mean_Kuf = tf.reshape(e_mean_Kuf, [num_data, num_func, num_ind])
        e_fmean_mean = tf.einsum("nqm,mz->nqz", e_mean_Kuf, Lit_q_mu)  # N x D x D
        e_related_to_mean = e_fmean_mean + tf.matrix_transpose(e_fmean_mean) + e_mean_mean

    if full_output_cov:
        fvar = (
                tf.matrix_diag(tf.tile((eKff - tf.trace(Li_eKuffu_Lit))[:, None], [1, num_func])) +
                tf.matrix_diag(tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)) +
                # tf.matrix_diag(tf.trace(tf.matmul(Li_eKuffu_Lit, cov))) +
                tf.einsum("ig,nij,jh->ngh", q_mu, Li_eKuffu_Lit, q_mu) -
                # tf.matmul(q_mu, tf.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) -
                fmean[:, :, None] * fmean[:, None, :] +
                e_related_to_mean
        )
    else:
        fvar = (
                (eKff - tf.trace(Li_eKuffu_Lit))[:, None] +
                tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov) +
                tf.einsum("ig,nij,jg->ng", q_mu, Li_eKuffu_Lit, q_mu) -
                fmean ** 2 +
                tf.matrix_diag_part(e_related_to_mean)
        )

    return fmean, fvar
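For orientation, in the whitened case the mean above reduces to fmean = eKuf^T Luu^{-1} q_mu. A minimal plain-TF sketch with hypothetical shapes (M inducing points, N inputs, D outputs):

import tensorflow as tf

M, N, D = 4, 3, 2
eKuf = tf.random_normal([M, N])     # stand-in for the psi1 expectation
Luu = tf.cholesky(2.0 * tf.eye(M))  # stand-in for chol(Kuu)
q_mu_w = tf.random_normal([M, D])   # whitened inducing mean
Li_eKuf = tf.matrix_triangular_solve(Luu, eKuf, lower=True)
fmean = tf.matmul(Li_eKuf, q_mu_w, transpose_a=True)  # N x D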
Example #34
 def call(self, inputs, q_mask=False, v_mask=False, a_mask=False):
     """实现多头注意力
     q_mask: 对输入的query序列的mask。
             主要是将输出结果的padding部分置0。
     v_mask: 对输入的value序列的mask。
             主要是防止attention读取到padding信息。
     a_mask: 对attention矩阵的mask。
             不同的attention mask对应不同的应用。
     """
     q, k, v = inputs[:3]
     # Handle the masks
     idx = 3
     if q_mask:
         q_mask = inputs[idx]
         idx += 1
     else:
         q_mask = None
     if v_mask:
         v_mask = inputs[idx]
         idx += 1
     else:
         v_mask = None
     if a_mask:
         if len(inputs) > idx:
             a_mask = inputs[idx]
         else:
             a_mask = 'history_only'
     else:
         a_mask = None
     # Linear projections
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Transpose dimensions
     qw = K.permute_dimensions(qw, (0, 2, 1, 3))
     kw = K.permute_dimensions(kw, (0, 2, 1, 3))
     vw = K.permute_dimensions(vw, (0, 2, 1, 3))
     # Flatten to rank-3 tensors
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.head_size))
     # Attention
     a = K.batch_dot(qw, kw, [2, 2]) / np.sqrt(self.key_size)
     a = add_seq_mask(a, v_mask, 1, -1, self.heads)
     if a_mask is not None:
         if a_mask == 'history_only':
             # Causal mask: ban attention to future positions.
             ones = K.ones_like(a[:1])
             a_mask = (ones - tf.matrix_band_part(ones, -1, 0)) * 1e12
             a = a - a_mask
         else:
             a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # Assemble the output
     o = K.batch_dot(a, vw, [2, 1])
     o = K.reshape(o, (-1, self.heads, K.shape(q)[1], self.head_size))
     o = K.permute_dimensions(o, (0, 2, 1, 3))
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     o = add_seq_mask(o, q_mask, 0)
     return o
Example #35
    def __init__(self, lr, batch_size, dimension, util_train, util_test,
                 campaign, reg_lambda, sigma):
        # hyperparameters
        self.lr = lr
        self.batch_size = batch_size
        self.util_train = util_train
        self.util_test = util_test
        self.reg_lambda = reg_lambda
        self.sigma = sigma
        self.emb_size = 20

        self.train_data_amt = util_train.get_data_amt()
        self.test_data_amt = util_test.get_data_amt()

        # output dir
        model_name = "{}_{}_{}_{}".format(self.lr, self.reg_lambda,
                                          self.batch_size, self.sigma)
        self.output_dir = "output/deephit/{}/{}/".format(campaign, model_name)
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        # reset graph
        tf.reset_default_graph()

        # field params
        self.field_sizes = self.util_train.feat_sizes
        self.field_num = len(self.field_sizes)

        # placeholders
        self.X = [
            tf.sparse_placeholder(tf.float64)
            for i in range(0, self.field_num)
        ]
        self.z = tf.placeholder(tf.float64)
        self.b = tf.placeholder(tf.float64)
        self.y = tf.placeholder(tf.float64)

        # embedding layer
        self.var_map = {}
        # for truncated
        self.var_map['embed_0'] = tf.Variable(
            tf.truncated_normal([self.field_sizes[0], 1], dtype=tf.float64))
        for i in range(1, self.field_num):
            self.var_map['embed_%d' % i] = tf.Variable(
                tf.truncated_normal([self.field_sizes[i], self.emb_size],
                                    dtype=tf.float64))

        # after embedding
        w0 = [self.var_map['embed_%d' % i] for i in range(self.field_num)]
        self.dense_input = tf.concat([
            tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
            for i in range(self.field_num)
        ], 1)

        # shared network
        self.hidden1 = tf.Variable(initial_value=tf.truncated_normal(
            shape=[(self.field_num - 1) * self.emb_size + 1, HIDDEN_SIZE1],
            dtype=tf.float64),
                                   name='h1')
        self.out1 = tf.Variable(initial_value=tf.truncated_normal(
            shape=[HIDDEN_SIZE1, OUT_SIZE1], dtype=tf.float64),
                                name='o1')
        self.hidden2 = tf.Variable(initial_value=tf.truncated_normal(
            shape=[OUT_SIZE1, HIDDEN_SIZE2], dtype=tf.float64),
                                   name='h2')
        self.out2 = tf.Variable(initial_value=tf.truncated_normal(
            shape=[HIDDEN_SIZE2, OUT_SIZE2], dtype=tf.float64),
                                name='o2')

        # cause-specific network
        self.hidden1_val = tf.nn.relu(tf.matmul(self.dense_input,
                                                self.hidden1))
        self.out1_val = tf.sigmoid(tf.matmul(self.hidden1_val, self.out1))
        self.hidden2_val = tf.nn.relu(tf.matmul(self.out1_val, self.hidden2))
        self.out2_val = tf.sigmoid(tf.matmul(self.hidden2_val, self.out2))

        # p_z and w_b
        self.p = tf.nn.softmax(self.out2_val)
        self.w = tf.cumsum(self.p, exclusive=True, axis=1)

        idx_z = tf.stack([
            tf.reshape(tf.range(tf.shape(self.z)[0]), (-1, 1)),
            tf.cast(self.z - 1, tf.int32)
        ],
                         axis=-1)
        idx_b = tf.stack([
            tf.reshape(tf.range(tf.shape(self.b)[0]), (-1, 1)),
            tf.cast(self.b - 1, tf.int32)
        ],
                         axis=-1)

        self.pz = tf.gather_nd(self.p, idx_z)
        self.wb = tf.gather_nd(self.w, idx_b)
        self.wz = tf.gather_nd(self.w, idx_z)

        # loss and train step
        self.loss1 = -tf.reduce_sum(
            tf.log(tf.clip_by_value(self.pz, 1e-8, 1.0)) * self.y)
        self.loss2 = -tf.reduce_sum(
            tf.log(tf.clip_by_value(1 - self.wb, 1e-8, 1.0)) * (1 - self.y))
        self.reg_loss = tf.nn.l2_loss(self.hidden1[1:,]) + tf.nn.l2_loss(self.hidden2[1:,]) + \
                        tf.nn.l2_loss(self.out1[1:,]) + tf.nn.l2_loss(self.out2[1:,])

        # get ranking loss
        self.w_of_pair = tf.transpose(
            tf.nn.embedding_lookup(tf.transpose(self.w),
                                   tf.cast(self.z[:, 0] - 1, tf.int32)))
        self.w_of_self = tf.reshape(
            tf.tile(tf.reshape(self.wz, (self.batch_size, )),
                    [self.batch_size]), (self.batch_size, self.batch_size))
        self.win_label = tf.reshape(
            tf.tile(tf.reshape(self.y, (self.batch_size, )),
                    [self.batch_size]), (self.batch_size, self.batch_size))
        self.delta = self.w_of_self - self.w_of_pair
        self.candidate = tf.exp(-self.delta / self.sigma)
        self.rank_loss = tf.reduce_sum(
            tf.matrix_band_part(self.candidate, -1, 0) * self.win_label)

        self.loss = self.loss1 + self.loss2 + self.reg_lambda * self.reg_loss + self.rank_loss

        self.optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self.train_step = self.optimizer.minimize(self.loss)

        # session initialization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        tf.global_variables_initializer().run(session=self.sess)
Example #36
    def ready(self):
        config = self.config
        N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden
        gru = cudnn_gru if config.use_cudnn else native_gru

        gi = []
        att_vP = []

        for i in range(config.max_para):
            print(i)
            with tf.variable_scope("emb" + str(i)):
                with tf.variable_scope("char" + str(i)):
                    #CL = tf.Print(CL,[CL],message="CL:")
                    #PL = tf.Print(PL,[PL],message="PL:")
                    #self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr.get_shape()],message="ch_pr:")
                    self.ch_pr_ = self.ch_pr[:, i * config.para_limit:(i + 1) *
                                             config.para_limit, :]
                    print(self.ch_pr_.get_shape())
                    #self.c_pr = tf.reshape(self.c_pr, [N, 12, PL])
                    #print(self.ch.get_shape())
                    #print(self.ch_pr.get_shape())
                    #print(self.c.get_shape())
                    #print(self.c_pr.get_shape())
                    #self.ch_pr = tf.Print(self.ch_pr,[self.ch_pr[:,2:,:]],message="ch_pr")
                    ch_emb = tf.reshape(
                        tf.nn.embedding_lookup(self.char_mat, self.ch_pr_),
                        [N * PL, CL, dc])
                    # self.char_mat, self.ch), [N * PL, CL, dc])
                    qh_emb = tf.reshape(
                        tf.nn.embedding_lookup(self.char_mat, self.qh),
                        [N * QL, CL, dc])
                    ch_emb = dropout(ch_emb,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
                    #ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb")
                    #qh_emb = tf.Print(qh_emb,[qh_emb],message="qh_emb")
                    qh_emb = dropout(qh_emb,
                                     keep_prob=config.keep_prob,
                                     is_train=self.is_train)
                    cell_fw = tf.contrib.rnn.GRUCell(dg)
                    cell_bw = tf.contrib.rnn.GRUCell(dg)
                    _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw,
                        cell_bw,
                        ch_emb,
                        self.ch_len,
                        dtype=tf.float32)
                    ch_emb = tf.concat([state_fw, state_bw], axis=1)
                    _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw,
                        cell_bw,
                        qh_emb,
                        self.qh_len,
                        dtype=tf.float32)
                    #state_fw = tf.Print(state_fw,[state_fw],message="state_fw")
                    #state_bw = tf.Print(state_bw,[state_bw],message="state_bw")
                    qh_emb = tf.concat([state_fw, state_bw], axis=1)
                    qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                    ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])
                    #ch_emb = tf.Print(ch_emb,[ch_emb],message="ch_emb")
                with tf.name_scope("word" + str(i)):
                    c_emb = tf.nn.embedding_lookup(
                        self.word_mat,
                        self.c_pr[:, i * config.para_limit:(i + 1) *
                                  config.para_limit])
                    q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

                c_emb = tf.concat([c_emb, ch_emb], axis=2)
                q_emb = tf.concat([q_emb, qh_emb], axis=2)

            with tf.variable_scope("encoding" + str(i)):
                rnn = gru(num_layers=3,
                          num_units=d,
                          batch_size=N,
                          input_size=c_emb.get_shape().as_list()[-1],
                          keep_prob=config.keep_prob,
                          is_train=self.is_train)
                c = rnn(c_emb, seq_len=self.c_len)
                q = rnn(q_emb, seq_len=self.q_len)

            with tf.variable_scope("attention" + str(i)):
                qc_att = dot_attention(c,
                                       q,
                                       mask=self.q_mask,
                                       hidden=d,
                                       keep_prob=config.keep_prob,
                                       is_train=self.is_train)
                rnn = gru(num_layers=1,
                          num_units=d,
                          batch_size=N,
                          input_size=qc_att.get_shape().as_list()[-1],
                          keep_prob=config.keep_prob,
                          is_train=self.is_train)
                att = rnn(qc_att, seq_len=self.c_len)
                # att is the v_P
                if i == 0:
                    att_vP = tf.identity(att)
                else:
                    att_vP = tf.concat([att_vP, att], axis=1)
                #att = tf.Print(att,[att],message="att:")
                print("att:", att.get_shape().as_list())
                print("att_vP:", att_vP.get_shape().as_list())
            #att_vP = tf.Print(att_vP,[tf.shape(att_vP)],message="att_vP:")
            """
			with tf.variable_scope("match"):
				self_att = dot_attention(
					att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train)
				rnn = gru(num_layers=1, num_units=d, batch_size=N, input_size=self_att.get_shape(
				).as_list()[-1], keep_prob=config.keep_prob, is_train=self.is_train)
				match = rnn(self_att, seq_len=self.c_len)
			"""
        with tf.variable_scope("pointer"):

            # r_Q:
            init = summ(q[:, :, -2 * d:],
                        d,
                        mask=self.q_mask,
                        keep_prob=config.ptr_keep_prob,
                        is_train=self.is_train)
            print("rQ:", init.get_shape().as_list())
            pointer = ptr_net(batch=N,
                              hidden=init.get_shape().as_list()[-1],
                              keep_prob=config.ptr_keep_prob,
                              is_train=self.is_train)
            logits1, logits2 = pointer(init, att, d, self.c_mask)

        with tf.variable_scope("predict"):
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            #losses1_2 = tf.reduce_mean(losses1_2, axis=0)
            self.loss = tf.reduce_mean(losses + losses2)

            # print losses
            #condition = tf.greater(self.loss, 11)
            #self.yp1 = tf.where(condition, tf.Print(self.yp1,[self.yp1],message="Yp1:"), self.yp1)
            #self.yp2 = tf.where(condition, tf.Print(self.yp2,[self.yp2],message="Yp2:"), self.yp1)

        if config.with_passage_ranking:
            gi = None
            for i in range(config.max_para):
                # Passage ranking
                with tf.variable_scope("passage-ranking-attention" + str(i)):

                    #att_vP = tf.Print(att_vP,[att_vP.get_shape()],message="att_vP:")
                    vj_P = att_vP[:, i * config.para_limit:(i + 1) *
                                  config.para_limit, :]
                    pr_att = pr_attention(
                        batch=N,
                        hidden=init.get_shape().as_list()[-1],
                        keep_prob=config.keep_prob,
                        is_train=self.is_train)
                    r_P = pr_att(init, vj_P, d, self.c_mask)
                    #r_P = tf.Print(r_P,[r_P],message="r_p")
                    # Wg
                    concatenate = tf.concat([init, r_P], axis=1)
                    g = tf.nn.tanh(
                        dense(concatenate,
                              hidden=d,
                              use_bias=False,
                              scope="g" + str(i)))
                    g_ = dense(g, 1, use_bias=False, scope="g_" + str(i))
                    #g = tf.Print(g,[g],message="g")
                    if i == 0:
                        gi = tf.reshape(g_, [N, 1])
                    else:
                        gi = tf.concat([gi, tf.reshape(g_, [N, 1])], axis=1)
            #gi_ = tf.convert_to_tensor(gi,dtype=tf.float32)
            #self.gi = tf.nn.softmax(gi_)
            #self.losses3 = tf.nn.softmax_cross_entropy_with_logits(
            #			logits=gi_, labels=tf.reshape(self.pr,[-1,1]))
            self.losses3 = tf.nn.softmax_cross_entropy_with_logits(
                logits=gi, labels=self.pr)
            #self.losses3 = tf.Print(self.losses3,[self.losses3,tf.reduce_max(self.losses3),
            #	tf.reduce_max(self.pr),tf.reduce_max(gi)],message="losses3:")
            self.pr_loss = tf.reduce_mean(self.losses3)
            #self.pr_loss = tf.Print(self.pr_loss,[self.pr_loss])
            self.r = tf.constant(0.8)
            self.e_loss1 = tf.multiply(self.r, self.loss)
            self.e_loss2 = tf.multiply(tf.subtract(tf.constant(1.0), self.r),
                                       self.pr_loss)
            self.e_loss = tf.add(self.e_loss1, self.e_loss2)
Example #37
def attention_bias_center(attn_bias, w_size, value=10.):
    """Add a positive bias of `value` inside a band of half-width
    w_size - 1 around the diagonal, restricted to unmasked positions."""
    bias_mask = tf.cast(tf.equal(attn_bias[0, 0], 0), attn_bias.dtype)
    centered_bias = tf.matrix_band_part(bias_mask, w_size - 1,
                                        w_size - 1) * value
    centered_bias = tf.expand_dims(tf.expand_dims(centered_bias, 0), 0)
    return centered_bias
Example #38
    def __init__(self, ind, y, U, m, B, lr, num_c):
        self.U = U
        self.m = m
        self.y = y.reshape([y.size, 1])
        self.ind = ind
        self.B = B
        self.learning_rate = lr
        self.nmod = len(self.U)
        self.r = self.U[0].shape[1]
        self.tf_U = [
            tf.Variable(self.U[k], dtype=tf.float32) for k in range(self.nmod)
        ]
        self.d = 0
        self.num_channels = num_c
        for k in range(self.nmod):
            self.d = self.d + self.U[k].shape[1]

        #init mu, L, Z
        Zinit = self.init_pseudo_inputs()
        self.tf_W = tf.reshape(tf.Variable(Zinit, dtype=tf.float32),
                               [self.m, self.r, self.nmod, 1])
        self.N = y.size

        #covariance of the noise
        self.U_covar_diag_tf = [
            tf.Variable(np.ones(self.U[k].shape[0] * self.U[k].shape[1]),
                        dtype=tf.float32) for k in range(self.nmod)
        ]
        U_covar_sqrt_mat_tf = [
            tf.linalg.diag(self.U_covar_diag_tf[k]) for k in range(self.nmod)
        ]

        #variational posterior
        self.tf_mu = tf.Variable(np.zeros([m, 1]), dtype=tf.float32)
        self.tf_L = tf.Variable(np.eye(m), dtype=tf.float32)
        #shallow kernel parameters
        self.tf_log_lengthscale = tf.Variable(0.0, dtype=tf.float32)
        self.tf_log_tau = tf.Variable(0.0, dtype=tf.float32)

        #Stochastic Variational ELBO
        #A mini-batch of observed entry indices
        self.tf_sub = tf.placeholder(tf.int32, shape=[None, self.nmod])
        self.tf_y = tf.placeholder(tf.float32, shape=[None, 1])

        #convolution variables and parameters
        self.conv0_f_shape = [2, 2, 1, self.num_channels]
        self.tf_conv0_w = tf.Variable(
            tf.truncated_normal(self.conv0_f_shape, stddev=0.03))
        self.tf_bias0 = tf.Variable(tf.truncated_normal([self.num_channels]))
        self.conv1_f_shape = [self.r, 1, self.num_channels, self.num_channels]

        self.tf_conv1_w = tf.Variable(
            tf.truncated_normal(self.conv1_f_shape, stddev=0.03))
        self.tf_bias1 = tf.Variable(tf.truncated_normal([self.num_channels]))

        self.conv2_f_shape = [
            1, self.nmod, self.num_channels, self.num_channels
        ]
        self.tf_conv2_w = tf.Variable(
            tf.truncated_normal(self.conv2_f_shape, stddev=0.03))
        self.tf_bias2 = tf.Variable(tf.truncated_normal([self.num_channels]))

        #compute convolutions for pseudo inputs
        self.tf_Z = self.compute_convs(self.tf_W)

        #compute convolutions and generate noise for batch
        tf_noise = [
            tf.matmul(
                0.1 * U_covar_sqrt_mat_tf[k],
                tf.random_normal(
                    shape=[self.U[k].shape[0] * self.U[k].shape[1], 1]))
            for k in range(self.nmod)
        ]
        tf_noise = [
            tf.reshape(tf_noise[k], self.U[k].shape) for k in range(self.nmod)
        ]

        tf_inputs = tf.concat([
            tf.gather((self.tf_U[k] + tf_noise[k]), self.tf_sub[:, k])
            for k in range(self.nmod)
        ], 1)

        tf_inputs = tf.reshape(tf_inputs, [-1, self.r, self.nmod, 1])
        tf_inputs = self.compute_convs(tf_inputs)

        Ltril = tf.matrix_band_part(self.tf_L, -1, 0)
        Kmm = self.kernel_matrix(self.tf_Z)
        Kmn = self.kernel_cross(self.tf_Z, tf_inputs)
        Knm = tf.transpose(Kmn)
        KnmKmmInv = tf.transpose(tf.matrix_solve(Kmm, Kmn))
        KnmKmmInvL = tf.matmul(KnmKmmInv, Ltril)
        tau = tf.exp(self.tf_log_tau)
        lengthscale = tf.exp(self.tf_log_lengthscale)
        hh_expt = tf.matmul(Ltril, tf.transpose(Ltril)) + tf.matmul(
            self.tf_mu, tf.transpose(self.tf_mu))
        ELBO = -0.5*tf.linalg.logdet(Kmm) - 0.5*tf.trace(tf.matrix_solve(Kmm, hh_expt)) + 0.5*tf.reduce_sum(tf.log(tf.pow(tf.diag_part(Ltril), 2))) \
                + 0.5*self.N*self.tf_log_tau - 0.5*tau*self.N/self.B*tf.reduce_sum(tf.pow(self.tf_y - tf.matmul(KnmKmmInv,self.tf_mu), 2)) \
                - 0.5*tau*( self.N*(1+jitter) - self.N/self.B*tf.reduce_sum(KnmKmmInv*Knm) + self.N/self.B*tf.reduce_sum(tf.pow(KnmKmmInvL,2)) ) \
                + 0.5*self.m - 0.5*self.N*tf.log(2.0*tf.constant(np.pi, dtype=tf.float32))  #\
        #- 0.5*tf.reduce_sum(tf.pow(self.tf_U[0],2)) - 0.5*tf.reduce_sum(tf.pow(self.tf_U[1],2)) - 0.5*tf.reduce_sum(tf.pow(self.tf_U[2],2))
        #- 0.5*tf.pow(tau,2) - 0.5*tf.pow(lengthscale, 2)

        # Add the entropy of the variational posterior to the ELBO.
        # This uses the property log det(A) = 2*sum(log(real(diag(C)))),
        # where C is a Cholesky factor of A. Since 0.1*U_covar_sqrt_mat_tf[k]
        # is already such a (diagonal) factor, no Cholesky decomposition
        # needs to be computed here.
        for k in range(self.nmod):
            ELBO += 0.5*2.0 * math_ops.reduce_sum(
                 math_ops.log(math_ops.real(array_ops.matrix_diag_part(0.1*U_covar_sqrt_mat_tf[k]))),
                 axis=[-1])\
                + (self.U[k].shape[0]*self.U[k].shape[1])/2*(1+tf.log(2.0*tf.constant(np.pi, dtype=tf.float32)))

        self.loss = -ELBO
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.minimizer = self.optimizer.minimize(self.loss)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.sess.run(tf.global_variables_initializer())
Example #39
def causal_mask(length, neg_inf=-1e9, name=None):
    with tf.name_scope(name, default_name="causal_mask"):
        lower_triangle = tf.matrix_band_part(tf.ones([length, length]), -1, 0)
        ret = neg_inf * (1.0 - lower_triangle)
        return tf.reshape(ret, [1, 1, length, length])
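A hedged usage sketch: the returned [1, 1, length, length] mask broadcasts over the batch and head dimensions, so adding it to attention logits blocks attention to future positions (the logits tensor here is illustrative):

import tensorflow as tf

logits = tf.random_normal([2, 4, 5, 5])  # [batch, heads, length, length]
masked = logits + causal_mask(5)         # future positions pushed to ~-1e9
weights = tf.nn.softmax(masked)          # position i attends only to j <= i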
Example #40
def uncertain_conditional(Xnew_mu,
                          Xnew_var,
                          feat,
                          kern,
                          q_mu,
                          q_sqrt,
                          full_cov_output=False,
                          full_cov=False,
                          whiten=False):
    """
    Calculates the conditional for uncertain inputs Xnew, p(Xnew) = N(Xnew_mu, Xnew_var).
    See ``conditional`` documentation for further reference.

    :param Xnew_mu: mean of the inputs, size N x Din
    :param Xnew_var: covariance matrix of the inputs, size N x Din x Din
    :param feat: gpflow.InducingFeature object, only InducingPoints is supported
    :param kern: gpflow kernel or ekernel object.
    :param q_mu: mean inducing points, size M x Dout
    :param q_sqrt: cholesky of the covariance matrix of the inducing points, size M x M x Dout
    :param full_cov_output: boolean whether to compute the covariance between output dimensions.
                            Influences the shape of the return value ``fvar``. Default is False.
    :param whiten: boolean whether to whiten the representation. Default is False.

    :return fmean, fvar: mean and covariance of the conditional; ``fmean`` has size N x Dout,
            and the size of ``fvar`` depends on ``full_cov_output``: N x Dout x Dout if True,
            N x Dout if False
    """

    # TODO: Tensorflow 1.3 doesn't support broadcasting in``tf.matmul`` and
    # ``tf.matrix_triangular_solve``. This is reported in issue 216.
    # As a temporary workaround, we are using ``tf.einsum`` for the matrix
    # multiplications and tiling in the triangular solves.
    # The code that should be used once the bug is resolved is added in comments.

    if not isinstance(feat, InducingPoints):
        raise NotImplementedError

    if full_cov:
        # TODO: ``full_cov`` True would return a ``fvar`` of shape N x N x D x D,
        # encoding the covariance between input datapoints as well.
        # This is not implemented as this feature is only used for plotting purposes.
        raise NotImplementedError

    num_data = tf.shape(Xnew_mu)[0]  # number of new inputs (N)
    num_func = tf.shape(q_mu)[1]  # output dimension (D)

    q_sqrt_r = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1,
                                   0)  # D x M x M

    eKuf = tf.transpose(feat.eKfu(kern, Xnew_mu, Xnew_var))  # M x N
    Kuu = feat.Kuu(kern, jitter=settings.numerics.jitter_level)  # M x M
    Luu = tf.cholesky(Kuu)  # M x M

    if not whiten:
        q_mu = tf.matrix_triangular_solve(Luu, q_mu, lower=True)
        Luu_tiled = tf.tile(
            Luu[None, :, :],
            [num_func, 1, 1])  # remove line once issue 216 is fixed
        q_sqrt_r = tf.matrix_triangular_solve(Luu_tiled, q_sqrt_r, lower=True)

    Li_eKuf = tf.matrix_triangular_solve(Luu, eKuf, lower=True)  # M x N
    fmean = tf.matmul(Li_eKuf, q_mu, transpose_a=True)

    eKff = kern.eKdiag(Xnew_mu, Xnew_var)  # N
    eKuffu = feat.eKufKfu(kern, Xnew_mu, Xnew_var)  # N x M x M
    Luu_tiled = tf.tile(
        Luu[None, :, :],
        [num_data, 1, 1])  # remove this line, once issue 216 is fixed
    Li_eKuffu_Lit = tf.matrix_triangular_solve(Luu_tiled,
                                               tf.matrix_transpose(eKuffu),
                                               lower=True)
    Li_eKuffu_Lit = tf.matrix_triangular_solve(
        Luu_tiled, tf.matrix_transpose(Li_eKuffu_Lit), lower=True)  # N x M x M

    cov = tf.matmul(q_sqrt_r, q_sqrt_r, transpose_b=True)  # D x M x M

    if full_cov_output:
        fvar = (
            tf.matrix_diag(
                tf.tile(
                    (eKff - tf.trace(Li_eKuffu_Lit))[:, None], [1, num_func]))
            + tf.matrix_diag(tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov)) +
            # tf.matrix_diag(tf.trace(tf.matmul(Li_eKuffu_Lit, cov))) +
            tf.einsum("ig,nij,jh->ngh", q_mu, Li_eKuffu_Lit, q_mu) -
            # tf.matmul(q_mu, tf.matmul(Li_eKuffu_Lit, q_mu), transpose_a=True) -
            tf.matmul(fmean[:, :, None], fmean[:, :, None], transpose_b=True))
    else:
        fvar = ((eKff - tf.trace(Li_eKuffu_Lit))[:, None] +
                tf.einsum("nij,dji->nd", Li_eKuffu_Lit, cov) +
                tf.einsum("ig,nij,jg->ng", q_mu, Li_eKuffu_Lit, q_mu) -
                fmean**2)

    return fmean, fvar
Example #41
def get_mask(batch_size, sequence_length):
    lower_triangle = tf.matrix_band_part(tf.ones([sequence_length, sequence_length]), -1, 0)
    result = -1e9 * (1.0 - lower_triangle)
    print("get_mask==>result:", result)
    return result
Example #42
    def multihead_attention(self, query, key, value, h=4, mask=False):
        W_query = tf.Variable(
            initial_value=tf.random_normal((self.hidden, self.hidden),
                                           stddev=1e-2),
            trainable=True,
            dtype=tf.float32,
        )
        W_key = tf.Variable(
            initial_value=tf.random_normal((self.hidden, self.hidden),
                                           stddev=1e-2),
            trainable=True,
            dtype=tf.float32,
        )
        W_value = tf.Variable(
            initial_value=tf.random_normal((self.hidden, self.hidden),
                                           stddev=1e-2),
            trainable=True,
            dtype=tf.float32,
        )
        W_output = tf.Variable(
            initial_value=tf.random_normal((self.hidden, self.hidden),
                                           stddev=1e-2),
            trainable=True,
            dtype=tf.float32,
        )
        multi_query = tf.concat(tf.unstack(tf.reshape(
            tf.matmul(tf.reshape(query, [-1, self.hidden]), W_query),
            [-1, 1, tf.shape(query)[1], h,
             int(self.hidden / h)]),
                                           axis=3),
                                axis=1)
        multi_key = tf.concat(tf.unstack(tf.reshape(
            tf.matmul(tf.reshape(key, [-1, self.hidden]), W_key),
            [-1, 1, tf.shape(key)[1], h,
             int(self.hidden / h)]),
                                         axis=3),
                              axis=1)
        multi_value = tf.concat(tf.unstack(tf.reshape(
            tf.matmul(tf.reshape(value, [-1, self.hidden]), W_value),
            [-1, 1, tf.shape(value)[1], h,
             int(self.hidden / h)]),
                                           axis=3),
                                axis=1)
        dotp = tf.matmul(multi_query, multi_key, transpose_b=True) / (tf.cast(
            tf.shape(multi_query)[-1], tf.float32)**0.5)
        attention_weights = tf.nn.softmax(dotp)

        if mask:
            # Causal masking: zero out attention to future positions, then
            # renormalize so each row still sums to one.
            attention_weights = tf.matrix_band_part(attention_weights, -1, 0)
            attention_weights /= tf.reduce_sum(attention_weights,
                                               axis=3,
                                               keep_dims=True)

        weighted_sum = tf.matmul(attention_weights, multi_value)
        weighted_sum = tf.concat(tf.unstack(weighted_sum, axis=1), axis=-1)

        multihead = tf.reshape(
            tf.matmul(tf.reshape(weighted_sum, [-1, self.hidden]), W_output),
            [-1, tf.shape(query)[1], self.hidden])
        output = multihead + query
        output = tf.contrib.layers.layer_norm(output, begin_norm_axis=2)
        return output, attention_weights
Example #43
lambda_phi = tf.Variable(lambda_phi_var, trainable=False, dtype=tf.float64)
lambda_pi_var = tf.Variable(lambda_pi_var, dtype=tf.float64)
lambda_beta_var = tf.Variable(lambda_beta_var, dtype=tf.float64)
lambda_nu_var = tf.Variable(lambda_nu_var, dtype=tf.float64)
lambda_m = tf.Variable(lambda_m_var, dtype=tf.float64)
lambda_w_var = tf.Variable(lambda_w_var, dtype=tf.float64)

# Maintain numerical stability
lambda_pi = tf.nn.softplus(lambda_pi_var)
lambda_beta = tf.nn.softplus(lambda_beta_var)
lambda_nu = tf.add(tf.nn.softplus(lambda_nu_var), tf.cast(D, dtype=tf.float64))

# Positive semidefinite matrices defined via a Cholesky-style decomposition
mats = []
for k in range(K):
    aux1 = tf.matrix_set_diag(tf.matrix_band_part(lambda_w_var[k], -1, 0),
                              tf.nn.softplus(tf.diag_part(lambda_w_var[k])))
    mats.append(tf.matmul(aux1, aux1, transpose_b=True))
lambda_w = tf.convert_to_tensor(mats)

idx_tensor = tf.placeholder(tf.int32, shape=(BATCH_SIZE))

alpha_o = tf.convert_to_tensor(alpha_o, dtype=tf.float64)
nu_o = tf.convert_to_tensor(nu_o, dtype=tf.float64)
w_o = tf.convert_to_tensor(w_o, dtype=tf.float64)
m_o = tf.convert_to_tensor(m_o, dtype=tf.float64)
beta_o = tf.convert_to_tensor(beta_o, dtype=tf.float64)

# Evidence Lower Bound definition
e3 = tf.convert_to_tensor(0., dtype=tf.float64)
e2 = tf.convert_to_tensor(0., dtype=tf.float64)
Example #44
def make_tril_scale(
    loc=None,
    scale_tril=None,
    scale_diag=None,
    scale_identity_multiplier=None,
    shape_hint=None,
    validate_args=False,
    assert_positive=False,
    name=None):
  """Creates a LinearOperator representing a lower triangular matrix.

  Args:
    loc: Floating-point `Tensor`. This is used for inferring shape in the case
      where only `scale_identity_multiplier` is set.
    scale_tril: Floating-point `Tensor` representing a lower triangular matrix.
      `scale_tril` has shape [N1, N2, ...  k, k], which represents a k x k
      lower triangular matrix.
      When `None` no `scale_tril` term is added to the LinearOperator.
      The upper triangular elements above the diagonal are ignored.
    scale_diag: Floating-point `Tensor` representing the diagonal matrix.
      `scale_diag` has shape [N1, N2, ...  k], which represents a k x k
      diagonal matrix.
      When `None` no diagonal term is added to the LinearOperator.
    scale_identity_multiplier: floating point rank 0 `Tensor` representing a
      scaling done to the identity matrix.
      When `scale_identity_multiplier = scale_diag = scale_tril = None` then
      `scale += IdentityMatrix`. Otherwise no scaled-identity-matrix is added
      to `scale`.
    shape_hint: scalar integer `Tensor` representing a hint at the dimension of
      the identity matrix when only `scale_identity_multiplier` is set.
    validate_args: Python `bool` indicating whether arguments should be
      checked for correctness.
    assert_positive: Python `bool` indicating whether LinearOperator should be
      checked for being positive definite.
    name: Python `str` name given to ops managed by this object.

  Returns:
    `LinearOperator` representing a lower triangular matrix.

  Raises:
    ValueError:  If only `scale_identity_multiplier` is set and `loc` and
      `shape_hint` are both None.
  """

  def _maybe_attach_assertion(x):
    if not validate_args:
      return x
    if assert_positive:
      return control_flow_ops.with_dependencies([
          tf.assert_positive(
              tf.matrix_diag_part(x), message="diagonal part must be positive"),
      ], x)
    return control_flow_ops.with_dependencies([
        tf.assert_none_equal(
            tf.matrix_diag_part(x),
            tf.zeros([], x.dtype),
            message="diagonal part must be non-zero"),
    ], x)

  with tf.name_scope(
      name,
      "make_tril_scale",
      values=[loc, scale_diag, scale_identity_multiplier]):

    loc = _convert_to_tensor(loc, name="loc")
    scale_tril = _convert_to_tensor(scale_tril, name="scale_tril")
    scale_diag = _convert_to_tensor(scale_diag, name="scale_diag")
    scale_identity_multiplier = _convert_to_tensor(
        scale_identity_multiplier,
        name="scale_identity_multiplier")

    if scale_tril is not None:
      scale_tril = tf.matrix_band_part(scale_tril, -1, 0)  # Zero out TriU.
      tril_diag = tf.matrix_diag_part(scale_tril)
      if scale_diag is not None:
        tril_diag += scale_diag
      if scale_identity_multiplier is not None:
        tril_diag += scale_identity_multiplier[..., tf.newaxis]

      scale_tril = tf.matrix_set_diag(scale_tril, tril_diag)

      return tf.linalg.LinearOperatorLowerTriangular(
          tril=_maybe_attach_assertion(scale_tril),
          is_non_singular=True,
          is_self_adjoint=False,
          is_positive_definite=assert_positive)

    return make_diag_scale(
        loc=loc,
        scale_diag=scale_diag,
        scale_identity_multiplier=scale_identity_multiplier,
        shape_hint=shape_hint,
        validate_args=validate_args,
        assert_positive=assert_positive,
        name=name)
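A minimal usage sketch (the numbers are arbitrary); per the code above, scale_diag and scale_identity_multiplier, when given, are added onto the diagonal of scale_tril:

scale = make_tril_scale(
    scale_tril=[[1., 0.], [2., 3.]],
    scale_diag=[0.5, 0.5])
# scale.to_dense() => [[1.5, 0.0],
#                      [2.0, 3.5]]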
Example #45
import tensorflow as tf
"""tf.matrix_band_part(input,num_lower,num_upper,name=None)
功能:复制一个矩阵,并将规定带之外的元素置为0。
     假设元素坐标为(m,n),则in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) &&
                                          (num_upper < 0 || (n-m) <= num_upper)。
    band(m,n)=in_band(m,n)*input(m,n)。
    特殊情况:
          tf.matrix_band_part(input, 0, -1) ==> 上三角阵.
          tf.matrix_band_part(input, -1, 0) ==> 下三角阵.
          tf.matrix_band_part(input, 0, 0) ==> 对角阵.
输入:num_lower:如果为负,则结果右上空三角阵;
     num_upper:如果为负,则结果左下为空三角阵。"""
a = tf.constant([[0, 1, 2, 3], [-1, 0, 1, 2], [-2, -1, 0, 1], [-3, -2, -1, 0]])
z = tf.matrix_band_part(a, 1, -1)  # 左下角空三角阵
# z==>[[0 1 2 3]
#      [-1 0 1 2]
#      [0 -1 0 1]
#      [0 0 -1 0]]
z1 = tf.matrix_band_part(a, 1, -2)  # 只要位置是负的话就行,与负数的数值无关
# [[ 0  1  2  3]
#  [-1  0  1  2]
#  [ 0 -1  0  1]
#  [ 0  0 -1  0]]
z2 = tf.matrix_band_part(a, -1, 1)  # 右上角空三角阵
# [[ 0  1  0  0]
#  [-1  0  1  0]
#  [-2 -1  0  1]
#  [-3 -2 -1  0]]
z3 = tf.matrix_band_part(a, 0, -1)
# [[0 1 2 3]
#  [0 0 1 2]
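A quick way to verify the commented outputs above (a minimal sketch, assuming the TF 1.x session API used elsewhere in these examples):

with tf.Session() as sess:
    print(sess.run([z, z1, z2, z3]))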
Example #46
def fully_correlated_conditional_repeat(Kmn, Kmm, Knn, f, *, full_cov=False, full_output_cov=False, q_sqrt=None,
                                        white=False):
    """
    This function handles conditioning of multi-output GPs in the case where the conditioning
    points are all fully correlated, in both the prior and posterior.

    Note: This conditional can handle 'repetitions' R, given in `f` and `q_sqrt`.

    :param Kmn: LM x N x P
    :param Kmm: LM x LM
    :param Knn: N x P or N x P x N x P
    :param f: data matrix, LM x R
    :param q_sqrt: R x LM x LM  or R x ML
    :param full_cov: calculate covariance between inputs
    :param full_output_cov: calculate covariance between outputs
    :param white: use whitened representation
    :return:
        - mean: R x N x P
        - variance: R x N x P, R x N x P x P, R x P x N x N, R x N x P x N x P
    """
    logger.debug("fully correlated conditional")
    R = tf.shape(f)[1]
    M, N, K = [tf.shape(Kmn)[i] for i in range(Kmn.shape.ndims)]
    Lm = tf.cholesky(Kmm)

    # Compute the projection matrix A
    # Lm: M x M    Kmn: M x NK
    Kmn = tf.reshape(Kmn, (M, N * K))  # M x NK
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)  # M x NK
    Ar = tf.reshape(A, (M, N, K))

    # compute the covariance due to the conditioning
    if full_cov and full_output_cov:
        # fvar = Knn - tf.matmul(Ar, Ar, transpose_a=True)  # NK x NK, then reshape?
        fvar = Knn - tf.tensordot(Ar, Ar, [[0], [0]])  # N x K x N x K
    elif full_cov and not full_output_cov:
        At = tf.transpose(Ar)  # K x N x M
        fvar = Knn - tf.matmul(At, At, transpose_b=True)  # K x N x N
    elif not full_cov and full_output_cov:
        # This transpose is annoying
        At = tf.transpose(Ar, [1, 0, 2])  # N x M x K
        # fvar = Knn - tf.einsum('mnk,mnl->nkl', Ar, Ar)
        fvar = Knn - tf.matmul(At, At, transpose_a=True)  # N x K x K
    elif not full_cov and not full_output_cov:
        # Knn: N x K
        fvar = Knn - tf.reshape(tf.reduce_sum(tf.square(A), [0, 1]), (N, K))  # Can also do this with a matmul

    # another backsubstitution in the unwhitened case
    if not white:
        # A = tf.matrix_triangular_solve(tf.matrix_transpose(Lm), A, lower=False)  # M x NK
        raise NotImplementedError("Need to verify this.")  # pragma: no cover

    # f: M x R
    fmean = tf.matmul(f, A, transpose_a=True)  # R x M  *  M x NK  ->  R x NK
    fmean = tf.reshape(fmean, (R, N, K))  # R x N x K

    if q_sqrt is not None:
        Lf = tf.matrix_band_part(q_sqrt, -1, 0)  # R x M x M
        if q_sqrt.get_shape().ndims == 3:
            A_tiled = tf.tile(A[None, :, :], tf.stack([R, 1, 1]))  # R x M x NK
            LTA = tf.matmul(Lf, A_tiled, transpose_a=True)  # R x M x NK
        elif q_sqrt.get_shape().ndims == 2:  # pragma: no cover
            raise NotImplementedError("Does not support diagonal q_sqrt yet...")
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))

        if full_cov and full_output_cov:
            addvar = tf.matmul(LTA, LTA, transpose_a=True)  # R x NK x NK
            fvar = fvar[None, :, :, :, :] + tf.reshape(addvar, (R, N, K, N, K))
        elif full_cov and not full_output_cov:
            LTAr = tf.transpose(tf.reshape(LTA, [R, M, N, K]), [0, 3, 1, 2])  # R x K x M x N
            addvar = tf.matmul(LTAr, LTAr, transpose_a=True)  # R x K x N x N
            fvar = fvar[None, ...] + addvar  # R x K x N x N
        elif not full_cov and full_output_cov:
            LTAr = tf.transpose(tf.reshape(LTA, (R, M, N, K)), [0, 2, 3, 1])  # R x N x K x M
            fvar = fvar[None, ...] + tf.matmul(LTAr, LTAr, transpose_b=True)  # R x N x K x K
        elif not full_cov and not full_output_cov:
            addvar = tf.reshape(tf.reduce_sum(tf.square(LTA), axis=1), (R, N, K))  # R x N x K
            fvar = fvar[None, ...] + addvar  # R x N x K
    return fmean, fvar
Example #47
File: model.py Project: txye/QANet
    def forward(self):
        config = self.config
        N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.qh), [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb, d,
                bias = True, activation = tf.nn.relu, kernel_size = 5, name = "char_conv", reuse = None)
            qh_emb = conv(qh_emb, d,
                bias = True, activation = tf.nn.relu, kernel_size = 5, name = "char_conv", reuse = True)

            ch_emb = tf.reduce_max(ch_emb, axis = 1)
            qh_emb = tf.reduce_max(qh_emb, axis = 1)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb, size = d, scope = "highway", dropout = self.dropout, reuse = None)
            q_emb = highway(q_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = self.c_mask,
                num_filters = d,
                num_heads = nh,
                seq_len = self.c_len,
                scope = "Encoder_Residual_Block",
                bias = False,
                dropout = self.dropout)
            q = residual_block(q_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = self.q_mask,
                num_filters = d,
                num_heads = nh,
                seq_len = self.q_len,
                scope = "Encoder_Residual_Block",
                reuse = True, # Share the weights between passage and question
                bias = False,
                dropout = self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen, input_keep_prob = 1.0 - self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask = mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), dim = 1),(0,2,1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis = -1)
            self.enc = [conv(inputs, d, name = "input_projection")]
            for i in range(3):
                if i % 2 == 0: # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                        num_blocks = 7,
                        num_conv_layers = 2,
                        kernel_size = 5,
                        mask = self.c_mask,
                        num_filters = d,
                        num_heads = nh,
                        seq_len = self.c_len,
                        scope = "Model_Encoder",
                        bias = False,
                        reuse = True if i > 0 else None,
                        dropout = self.dropout)
                    )

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[2]],axis = -1),1, bias = False, name = "start_pointer"),-1)
            end_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[3]],axis = -1),1, bias = False, name = "end_pointer"), -1)
            self.logits = [mask_logits(start_logits, mask = self.c_mask),
                           mask_logits(end_logits, mask = self.c_mask)]

            logits1, logits2 = [l for l in self.logits]

            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits1, labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v:
                        self.assign_vars.append(tf.assign(var,v))
Example #48
def base_conditional(Kmn, Kmm, Knn, f, *, full_cov=False, q_sqrt=None, white=False):
    """
    Given g1 and g2, and distributions p and q such that
      p(g2) = N(g2; 0, Kmm)
      p(g1) = N(g1; 0, Knn)
      p(g1|g2) = N(g1; Knm Kmm^{-1} g2, Knn - Knm Kmm^{-1} Kmn)
    and
      q(g2) = N(g2; f, q_sqrt q_sqrt^T),
    this method computes the mean and (co)variance of
      q(g1) = \int q(g2) p(g1|g2) dg2
    :param Kmn: M x N
    :param Kmm: M x M
    :param Knn: N x N  or  N
    :param f: M x R
    :param full_cov: bool
    :param q_sqrt: None or R x M x M (lower triangular)
    :param white: bool
    :return: N x R  or R x N x N
    """
    logger.debug("base conditional")
    # compute kernel stuff
    num_func = tf.shape(f)[1]  # R
    Lm = tf.cholesky(Kmm)

    # Compute the projection matrix A
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = Knn - tf.matmul(A, A, transpose_a=True)
        fvar = tf.tile(fvar[None, :, :], [num_func, 1, 1])  # R x N x N
    else:
        fvar = Knn - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(fvar[None, :], [num_func, 1])  # R x N

    # another backsubstitution in the unwhitened case
    if not white:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # construct the conditional mean
    fmean = tf.matmul(A, f, transpose_a=True)

    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2)  # R x M x N
        elif q_sqrt.get_shape().ndims == 3:
            L = tf.matrix_band_part(q_sqrt, -1, 0)  # R x M x M
            A_tiled = tf.tile(tf.expand_dims(A, 0), tf.stack([num_func, 1, 1]))
            LTA = tf.matmul(L, A_tiled, transpose_a=True)  # R x M x N
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # R x N x N
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # R x N

    if not full_cov:
        fvar = tf.transpose(fvar)  # N x R

    return fmean, fvar  # N x R, R x N x N or N x R
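A minimal smoke test for base_conditional, assuming the function above is in scope in the same module; the rbf helper and all sizes (M = 3 inducing points, N = 4 test points, R = 2 functions) are invented for this sketch.

import logging
import numpy as np
import tensorflow as tf

logger = logging.getLogger(__name__)  # base_conditional expects a module-level logger

rng = np.random.RandomState(0)
Z = rng.randn(3, 1)   # M = 3 inducing inputs
X = rng.randn(4, 1)   # N = 4 test inputs

def rbf(a, b):
    return np.exp(-0.5 * (a - b.T) ** 2)

Kmm = rbf(Z, Z) + 1e-6 * np.eye(3)
Kmn = rbf(Z, X)                      # M x N
Knn = np.ones(4)                     # diagonal of rbf(X, X), matching full_cov=False
f = rng.randn(3, 2)                  # M x R with R = 2

fmean, fvar = base_conditional(tf.constant(Kmn), tf.constant(Kmm),
                               tf.constant(Knn), tf.constant(f))

with tf.Session() as sess:
    m, v = sess.run([fmean, fvar])
    print(m.shape, v.shape)  # (4, 2) (4, 2)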
Example #49
Kfu = kernelfx(x, xu)

Kff = kernelfx(x, x)

Kuuinv = tf.matrix_inverse(Kuu + offset * tf.eye(m, dtype=tf.float64))
KfuKuuinv = tf.matmul(Kfu, Kuuinv)
KffKuuinvU = [
    tf.reshape(
        tf.matmul(KfuKuuinv,
                  tf.expand_dims(tf.cast(u[i], dtype=tf.float64), axis=1)),
        [-1]) for i in range(0, p)
]

KffKuuKuf = tf.matmul(KfuKuuinv, Kfu, transpose_b=True)
sigmaf_temp = Kff - KffKuuKuf
# Re-symmetrize: rebuild the matrix from its upper triangle so that
# sigmaf == sigmaf^T holds exactly despite floating-point round-off.
sigmaf_diag = tf.matrix_band_part(sigmaf_temp, 0, 0)      # diagonal only
sigmaf_upperT = tf.matrix_band_part(sigmaf_temp, 0, -1)   # upper triangle incl. diagonal
sigmaf = sigmaf_upperT + tf.transpose(sigmaf_upperT) - sigmaf_diag
f_scale = tf.cholesky(sigmaf + offset * tf.eye(M, dtype=tf.float64),
                      name='f_scale')

# p(F|U,X,Xu)
f = MultivariateNormalTriL(loc=tf.cast(KffKuuinvU, dtype=tf.float32),
                           scale_tril=tf.cast(f_scale, dtype=tf.float32),
                           name='pf')

# p(Y|F)
t_var_pre = tf.Variable(0.5 * np.ones((G, 1)), dtype=tf.float32)
t_var_full = tf.nn.softplus(t_var_pre)
idx_g = tf.placeholder(tf.int32, p)
t_var = tf.gather(t_var_full, idx_g)
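The diagonal/upper-triangle combination above is a common re-symmetrization idiom: rebuilding a matrix from its upper triangle makes it exactly symmetric regardless of floating-point noise. A standalone check (sketch, TF 1.x):

import numpy as np
import tensorflow as tf

a = tf.constant(np.random.randn(4, 4))
diag = tf.matrix_band_part(a, 0, 0)       # diagonal only
upper = tf.matrix_band_part(a, 0, -1)     # upper triangle, incl. diagonal
sym = upper + tf.transpose(upper) - diag  # mirror the upper triangle down

with tf.Session() as sess:
    s = sess.run(sym)
    print(np.allclose(s, s.T))  # True: symmetric by construction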
Example #50
# Build model.
x = tf.placeholder(tf.float64, [data_num, 1], name='x')
z = tf.placeholder(tf.float64, [gen_num, z_dim], name='z')

g = generator(z, width=width, depth=depth, activation=activation,
              out_dim=out_dim)
v = tf.concat([x, g], 0)
VVT = tf.matmul(v, tf.transpose(v))
sqs = tf.reshape(tf.diag_part(VVT), [-1, 1])  # squared norms ||v_i||^2 as a column
# Tile the column of squared norms across columns: entry (i, j) holds ||v_i||^2.
sqs_tiled_horiz = tf.tile(sqs, tf.transpose(sqs).get_shape())
# Pairwise squared distances: ||v_i||^2 - 2 v_i.v_j + ||v_j||^2.
exp_object = sqs_tiled_horiz - 2 * VVT + tf.transpose(sqs_tiled_horiz)
K = tf.exp(-0.5 * (1 / sigma) * exp_object)  # RBF kernel matrix
K_xx = K[:data_num, :data_num]
K_yy = K[data_num:, data_num:]
K_xy = K[:data_num, data_num:]
K_xx_upper = (tf.matrix_band_part(K_xx, 0, -1) - 
              tf.matrix_band_part(K_xx, 0, 0))
K_yy_upper = (tf.matrix_band_part(K_yy, 0, -1) - 
              tf.matrix_band_part(K_yy, 0, 0))
num_combos_xx = data_num * (data_num - 1) / 2
num_combos_yy = gen_num * (gen_num - 1) / 2
mmd = (tf.reduce_sum(K_xx_upper) / num_combos_xx +
       tf.reduce_sum(K_yy_upper) / num_combos_yy -
       2 * tf.reduce_sum(K_xy) / (data_num * gen_num))
g_vars = [var for var in tf.global_variables() if 'generator' in var.name]
if optimizer == 'adagrad':
    opt = tf.train.AdagradOptimizer(learning_rate)
elif optimizer == 'adam':
    opt = tf.train.AdamOptimizer(learning_rate)
elif optimizer == 'rmsprop':
    opt = tf.train.RMSPropOptimizer(learning_rate)
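The band_part pairs that build K_xx_upper and K_yy_upper above extract the strictly upper triangle, so each unordered pair (i, j) with i < j is counted exactly once in the MMD sums. A quick numeric check:

import numpy as np
import tensorflow as tf

Km = tf.constant(np.arange(9.0).reshape(3, 3))
strict_upper = tf.matrix_band_part(Km, 0, -1) - tf.matrix_band_part(Km, 0, 0)
with tf.Session() as sess:
    print(sess.run(strict_upper))
    # [[0. 1. 2.]
    #  [0. 0. 5.]
    #  [0. 0. 0.]]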
Example #51
    def _random_pd_matrix(self, *shape):
        mat = rng.rand(*shape)
        chol = tfd.matrix_diag_transform(mat, transform=tf.nn.softplus)
        chol = tf.matrix_band_part(chol, -1, 0)  # keep only the lower triangle so chol is a valid Cholesky factor
        return self.evaluate(tf.matmul(chol, chol, adjoint_b=True))
Example #52
def conditional(Kmn, Kmm, Knn, f, *, full_cov=False, q_sqrt=None, white=False):
    """
    Given a g1 and g2, and distribution p and q such that
      p(g2) = N(g2;0,Kmm)
      p(g1) = N(g1;0,Knn)
      p(g1|g2) = N(g1;0,Knm)
    And
      q(g2) = N(g2;f,q_sqrt*q_sqrt^T)
    This method computes the mean and (co)variance of
      q(g1) = \int q(g2) p(g1|g2)
    :param Kmn: P x M x N
    :param Kmm: M x M
    :param Knn: P x N x N  or P x N
    :param f: M x R
    :param full_cov: bool
    :param q_sqrt: R x M x M (lower triangular)
    :param white: bool
    :return: N x R  or R x N x N
    """
    logger.debug("base conditional")
    # compute kernel stuff
    num_func = tf.shape(f)[1]  # R

    Lm = tf.cholesky(Kmm)

    def solve_A(MN_Kmn):
        return tf.matrix_triangular_solve(Lm, MN_Kmn, lower=True) # M x M @ M x N -> M x N
    A = tf.map_fn(solve_A, Kmn) # P x M x N

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = Knn - tf.matmul(A, A, transpose_a=True)  # P x N x N (batched over P)
        fvar = tf.tile(fvar[None, :, :, :], [num_func, 1, 1, 1])  # R x P x N x N
    else:
        fvar = Knn - tf.reduce_sum(tf.square(A), 1) # P x N
        fvar = tf.tile(fvar[None, :, :], [num_func, 1, 1])  # R x P x N

    # another backsubstitution in the unwhitened case
    if not white:
        def backsub(MN_A):
            return tf.matrix_triangular_solve(tf.transpose(Lm), MN_A, lower=False)
        A = tf.map_fn(backsub, A) # P x M x N

    # construct the conditional mean
    fmean = tf.tensordot(A, f, [[1], [0]]) # P x N x R
    fmean = tf.transpose(fmean, [1, 0, 2]) # N x P x R

    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 3:
            L = tf.matrix_band_part(q_sqrt, -1, 0)  # R x M x M

            # A: P x M x N
            LTA = tf.tensordot(L, A, [[1], [1]]) # R x M x P x N
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.einsum('rmpn,rmpo->rpno', LTA, LTA)  # R x P x N x N
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1) # R x P x N

    return fmean, fvar # N x P x R, R x P x N or R x P x N x N
Example #53
    def ready(self):
        config = self.config
        N, PL, QL, CL, d, dc, dg = config.batch_size, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.char_hidden
        gru = cudnn_gru if config.use_cudnn else native_gru

        with tf.variable_scope("emb"):
            with tf.variable_scope("char"):
                ch_emb = tf.reshape(tf.nn.embedding_lookup(
                    self.char_mat, self.ch), [N * PL, CL, dc])
                qh_emb = tf.reshape(tf.nn.embedding_lookup(
                    self.char_mat, self.qh), [N * QL, CL, dc])
                ch_emb = dropout(
                    ch_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                qh_emb = dropout(
                    qh_emb, keep_prob=config.keep_prob, is_train=self.is_train)
                cell_fw = tf.contrib.rnn.GRUCell(dg)
                cell_bw = tf.contrib.rnn.GRUCell(dg)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, ch_emb, self.ch_len, dtype=tf.float32)
                ch_emb = tf.concat([state_fw, state_bw], axis=1)
                _, (state_fw, state_bw) = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, qh_emb, self.qh_len, dtype=tf.float32)
                qh_emb = tf.concat([state_fw, state_bw], axis=1)
                qh_emb = tf.reshape(qh_emb, [N, QL, 2 * dg])
                ch_emb = tf.reshape(ch_emb, [N, PL, 2 * dg])

            with tf.name_scope("word"):
                c_emb = tf.nn.embedding_lookup(self.word_mat, self.c)
                q_emb = tf.nn.embedding_lookup(self.word_mat, self.q)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

        with tf.variable_scope("encoding"):
            rnn = gru(num_layers=3, num_units=d, batch_size=N,
                      input_size=c_emb.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            c = rnn(c_emb, seq_len=self.c_len)
            q = rnn(q_emb, seq_len=self.q_len)

        with tf.variable_scope("attention"):
            qc_att = dot_attention(c, q, mask=self.q_mask, hidden=d,
                                   keep_prob=config.keep_prob, is_train=self.is_train)
            rnn = gru(num_layers=1, num_units=d, batch_size=N,
                      input_size=qc_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            att = rnn(qc_att, seq_len=self.c_len)

        with tf.variable_scope("match"):
            self_att = dot_attention(
                att, att, mask=self.c_mask, hidden=d, keep_prob=config.keep_prob, is_train=self.is_train)
            rnn = gru(num_layers=1, num_units=d, batch_size=N,
                      input_size=self_att.get_shape().as_list()[-1],
                      keep_prob=config.keep_prob, is_train=self.is_train)
            match = rnn(self_att, seq_len=self.c_len)

        with tf.variable_scope("pointer"):
            init = summ(q[:, :, -2 * d:], d, mask=self.q_mask,
                        keep_prob=config.ptr_keep_prob, is_train=self.is_train)
            pointer = ptr_net(batch=N, hidden=init.get_shape().as_list()[-1],
                              keep_prob=config.ptr_keep_prob, is_train=self.is_train)
            logits1, logits2 = pointer(init, match, d, self.c_mask)

        with tf.variable_scope("predict"):
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, 15)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits1, labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)
Example #54
def mask_attn_weights(w):
    n = shape_list(w)[-1]
    b = tf.matrix_band_part(tf.ones([n, n]), -1, 0)
    b = tf.reshape(b, [1, 1, n, n])
    w = w * b + -1e9 * (1 - b)
    return w
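Evaluating the same construction for n = 4 with all-zero attention weights shows the causal pattern: each row keeps positions at or before itself and pushes later positions to -1e9, so softmax assigns them (near-)zero probability. A minimal sketch:

import tensorflow as tf

n = 4
b = tf.matrix_band_part(tf.ones([n, n]), -1, 0)  # lower-triangular ones
w = tf.zeros([1, 1, n, n])                       # dummy attention logits
masked = w * b + -1e9 * (1 - b)

with tf.Session() as sess:
    print(sess.run(masked[0, 0]))
    # row i: zeros up to column i, then -1e9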
Example #55
def independent_interdomain_conditional(Kmn, Kmm, Knn, f, *, full_cov=False, full_output_cov=False,
                                        q_sqrt=None, white=False):
    """
    The inducing outputs live in the g-space (R^L).
    Interdomain conditional calculation.

    :param Kmn: M x L x N x P
    :param Kmm: L x M x M
    :param Knn: N x P  or  N x N  or  P x N x N  or  N x P x N x P
    :param f: data matrix, M x L
    :param q_sqrt: L x M x M  or  M x L
    :param full_cov: calculate covariance between inputs
    :param full_output_cov: calculate covariance between outputs
    :param white: use whitened representation
    :return:
        - mean: N x P
        - variance: N x P, N x P x P, P x N x N, N x P x N x P
    """
    logger.debug("independent_interdomain_conditional")
    M, L, N, P = [tf.shape(Kmn)[i] for i in range(Kmn.shape.ndims)]

    Lm = tf.cholesky(Kmm)  # L x M x M

    # Compute the projection matrix A
    Kmn = tf.reshape(tf.transpose(Kmn, (1, 0, 2, 3)), (L, M, N * P))
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)  # L x M x M  *  L x M x NP  ->  L x M x NP
    Ar = tf.reshape(A, (L, M, N, P))

    # compute the covariance due to the conditioning
    if full_cov and full_output_cov:
        fvar = Knn - tf.tensordot(Ar, Ar, [[0, 1], [0, 1]])  # N x P x N x P
    elif full_cov and not full_output_cov:
        At = tf.reshape(tf.transpose(Ar), (P, N, M * L))  # P x N x ML
        fvar = Knn - tf.matmul(At, At, transpose_b=True)  # P x N x N
    elif not full_cov and full_output_cov:
        At = tf.reshape(tf.transpose(Ar, [2, 3, 1, 0]), (N, P, M * L))  # N x P x ML
        fvar = Knn - tf.matmul(At, At, transpose_b=True)  # N x P x P
    elif not full_cov and not full_output_cov:
        fvar = Knn - tf.reshape(tf.reduce_sum(tf.square(A), [0, 1]), (N, P))  # Knn: N x P

    # another backsubstitution in the unwhitened case
    if not white:
        A = tf.matrix_triangular_solve(Lm, A, lower=True, adjoint=True)  # solve Lm^T X = A:  L x M x NP
        Ar = tf.reshape(A, (L, M, N, P))

    fmean = tf.tensordot(Ar, f, [[1, 0], [0, 1]])  # N x P

    if q_sqrt is not None:
        if q_sqrt.shape.ndims == 3:
            Lf = tf.matrix_band_part(q_sqrt, -1, 0)  # L x M x M
            LTA = tf.matmul(Lf, A, transpose_a=True)  # L x M x M  *  L x M x NP  ->  L x M x NP
        else:  # q_sqrt M x L
            LTA = (A * tf.transpose(q_sqrt)[..., None])  # L x M x NP

        if full_cov and full_output_cov:
            LTAr = tf.reshape(LTA, (L * M, N * P))
            fvar = fvar + tf.reshape(tf.matmul(LTAr, LTAr, transpose_a=True), (N, P, N, P))
        elif full_cov and not full_output_cov:
            LTAr = tf.transpose(tf.reshape(LTA, (L * M, N, P)), [2, 0, 1])  # P x LM x N
            fvar = fvar + tf.matmul(LTAr, LTAr, transpose_a=True)  # P x N x N
        elif not full_cov and full_output_cov:
            LTAr = tf.transpose(tf.reshape(LTA, (L * M, N, P)), [1, 0, 2])  # N x LM x P
            fvar = fvar + tf.matmul(LTAr, LTAr, transpose_a=True)  # N x P x P
        elif not full_cov and not full_output_cov:
            fvar = fvar + tf.reshape(tf.reduce_sum(tf.square(LTA), (0, 1)), (N, P))
    return fmean, fvar
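The full_cov and full_output_cov branch above contracts the (L, M) axes of Ar with itself via tensordot; a numpy sanity check (shapes invented: L = 2, M = 3, N = 4, P = 5) that this matches the corresponding einsum:

import numpy as np

Ar = np.random.randn(2, 3, 4, 5)  # L x M x N x P
td = np.tensordot(Ar, Ar, axes=[[0, 1], [0, 1]])
es = np.einsum('lmnp,lmqs->npqs', Ar, Ar)
print(td.shape, np.allclose(td, es))  # (4, 5, 4, 5) True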
Example #56
    def forward(self):
        PRTIN_ATT = 8  # number of attention heads left active by the head masks below

        config = self.config
        N, PL, QL, CL, d, dc, nh = config.batch_size if not self.demo else 1, self.c_maxlen, self.q_maxlen, config.char_limit, config.hidden, config.char_dim, config.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.ch),
                                [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(self.char_mat, self.qh),
                                [N * QL, CL, dc])
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder
            ch_emb = conv(ch_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=None)

            qh_emb = conv(qh_emb,
                          d,
                          bias=True,
                          activation=tf.nn.relu,
                          kernel_size=5,
                          name="char_conv",
                          reuse=True)

            ch_emb = tf.reduce_max(ch_emb, axis=1)
            qh_emb = tf.reduce_max(qh_emb, axis=1)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

            c_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.c),
                1.0 - self.dropout)
            q_emb = tf.nn.dropout(
                tf.nn.embedding_lookup(self.word_mat, self.q),
                1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=None)
            q_emb = highway(q_emb,
                            size=d,
                            scope="highway",
                            dropout=self.dropout,
                            reuse=True)

        # Use the maximum length to bound the bucketing over the dataset.
        # All heads share the same bucketing; only the per-head attention parts are learned.
        self.soft_t5_alpha, self.soft_t5_beta = get_clip(
            nh, A=1 / config.fixed_c_maxlen, config=config, name='layer_c')

        with tf.variable_scope("Embedding_Encoder_Layer"):
            self.c_t5_bias = compute_bias(nh,
                                          config.para_limit,
                                          config.para_limit,
                                          self.soft_t5_alpha,
                                          self.soft_t5_beta,
                                          l1_width=config.l1_width,
                                          l2_width=config.l2_width,
                                          stddev=config.stddev,
                                          dropout_prob=self.dropout,
                                          activation=config.soft_t5_activation,
                                          bidirectional=True,
                                          name='layer_c')
            print('[!!!-c_t5_bias:]', self.c_t5_bias)
            # Apply a head mask to c_t5_bias.
            head_mask = np.zeros((nh, config.para_limit, config.para_limit))
            #hidx=[7,2,3,0,6,1,4,5]
            low2high = [5, 4, 1, 6, 0, 3, 2, 7]
            for tt in range(PRTIN_ATT):
                head_mask[low2high[tt], :, :] = np.ones(
                    (config.para_limit, config.para_limit))

            self.c_t5_bias = self.c_t5_bias * head_mask

            self.c_layer_weights, c = residual_block(
                c_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.c_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.c_len,
                scope="Encoder_Residual_Block",
                bias=False,
                dropout=self.dropout,
                t5_bias=self.c_t5_bias[:, :, :self.c_maxlen, :self.c_maxlen])

            self.q_t5_bias = compute_bias(nh,
                                          config.ques_limit,
                                          config.ques_limit,
                                          self.soft_t5_alpha,
                                          self.soft_t5_beta,
                                          l1_width=config.l1_width,
                                          l2_width=config.l2_width,
                                          stddev=config.stddev,
                                          dropout_prob=self.dropout,
                                          activation=config.soft_t5_activation,
                                          bidirectional=True,
                                          name='layer_q')
            print('[!!!-q_t5_bias:]', self.q_t5_bias)

            head_mask = np.zeros((nh, config.ques_limit, config.ques_limit))
            #hidx=[7,0,6,2,4,1,3,5]
            low2high = [5, 3, 1, 4, 2, 6, 0, 7]
            for tt in range(PRTIN_ATT):
                head_mask[low2high[tt], :, :] = np.ones(
                    (config.ques_limit, config.ques_limit))

            self.q_t5_bias = self.q_t5_bias * head_mask

            #num_blocks = 1,
            self.q_layer_weights, q = residual_block(
                q_emb,
                num_blocks=1,
                num_conv_layers=4,
                kernel_size=7,
                mask=self.q_mask,
                num_filters=d,
                num_heads=nh,
                seq_len=self.q_len,
                scope="Encoder_Residual_Block",
                reuse=True,  # Share the weights between passage and question
                bias=False,
                dropout=self.dropout,
                t5_bias=self.q_t5_bias[:, :, :self.q_maxlen, :self.q_maxlen])

        #we need to revise this into multiple head attention~~
        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention(
                [c, q], self.c_maxlen, self.q_maxlen,
                input_keep_prob=1.0 - self.dropout)
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask=mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(
                tf.nn.softmax(mask_logits(S, mask=mask_c), dim=1), (0, 2, 1))
            self.c2q = tf.matmul(S_, q)
            self.q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, self.c2q, c * self.c2q, c * self.q2c]

        self.model_c_t5_bias_list = []
        self.model_c_layer_weights = []
        '''
    hidx_list = [[2,7,4,3,1,0,6,5],
                 [5,3,0,2,1,6,4,7],
                 [5,0,1,6,3,2,7,4]]'''

        hidx_list = [[5, 6, 0, 1, 3, 4, 7, 2], [7, 4, 6, 1, 2, 0, 3, 5],
                     [4, 7, 2, 3, 6, 1, 0, 5]]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis=-1)
            self.enc = [conv(inputs, d, name="input_projection")]
            for i in range(3):
                if i % 2 == 0:  # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i],
                                                1.0 - self.dropout)

                c_t5_bias = compute_bias(nh,
                                         config.para_limit,
                                         config.para_limit,
                                         self.soft_t5_alpha,
                                         self.soft_t5_beta,
                                         l1_width=config.l1_width,
                                         l2_width=config.l2_width,
                                         stddev=config.stddev,
                                         dropout_prob=self.dropout,
                                         activation=config.soft_t5_activation,
                                         bidirectional=True,
                                         name='model_layer_' + str(i))
                head_mask = np.zeros(
                    (nh, config.para_limit, config.para_limit))
                for tt in range(PRTIN_ATT):
                    head_mask[hidx_list[i][tt], :, :] = np.ones(
                        (config.para_limit, config.para_limit))

                c_t5_bias = c_t5_bias * head_mask

                self.model_c_t5_bias_list.append(c_t5_bias)
                print('[!!!-c_t5_bias:]', c_t5_bias)

                layer_weights, model_c = residual_block(
                    self.enc[i],
                    num_blocks=7,
                    num_conv_layers=2,
                    kernel_size=5,
                    mask=self.c_mask,
                    num_filters=d,
                    num_heads=nh,
                    seq_len=self.c_len,
                    scope="Model_Encoder",
                    bias=False,
                    reuse=True if i > 0 else None,
                    dropout=self.dropout,
                    t5_bias=self.model_c_t5_bias_list[i]
                    [:, :, :self.c_maxlen, :self.c_maxlen])
                self.model_c_layer_weights.append(layer_weights)
                self.enc.append(model_c)

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[2]], axis=-1),
                     1,
                     bias=False,
                     name="start_pointer"), -1)
            end_logits = tf.squeeze(
                conv(tf.concat([self.enc[1], self.enc[3]], axis=-1),
                     1,
                     bias=False,
                     name="end_pointer"), -1)
            self.logits = [
                mask_logits(start_logits, mask=self.c_mask),
                mask_logits(end_logits, mask=self.c_mask)
            ]

            logits1, logits2 = self.logits

            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=logits1,
                                                             labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(logits=logits2,
                                                              labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)
            print('self.loss:', self.loss)
            print('self.yp1:', self.yp1)
        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(
                regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    if v is not None:
                        self.assign_vars.append(tf.assign(var, v))
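The head masking used throughout forward() is plain elementwise gating: a 0/1 tensor of shape (num_heads, T, T) zeroes the learned bias for every head outside the kept set. A toy numpy sketch with invented sizes:

import numpy as np

nh, T = 4, 3
bias = np.random.randn(nh, T, T)      # stand-in for a computed t5 bias
keep = [2, 0]                         # heads left active
head_mask = np.zeros((nh, T, T))
for h in keep:
    head_mask[h] = np.ones((T, T))
masked_bias = bias * head_mask
print([bool(np.abs(masked_bias[h]).sum() > 0) for h in range(nh)])
# [True, False, True, False]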
Example #57
codes = []
for pair in pairs:  # `pairs` is hypothetical; the loop collecting code vectors was truncated above
    codes.append(list(pair[1]))
print(codes)
codes = tf.constant(codes, dtype=tf.float64)
labels = np.array([[0., 8., 8., 5., 8.],
 [8., 0., 8., 8., 5.],
 [8., 8., 0., 8., 8.],
 [5., 8., 8., 0., 8.],
 [8., 5., 8., 8., 0.]])
#loss = MSE_Loss(codes, labels, None)
sess = tf.Session()
#a = sess.run(loss)
bs = 5
k = 0
A1, A2 = tf.split(codes, [bs, bs*k])
# Handle the first part of the loss.
M1 = tf.matmul(A1, tf.transpose(A1))
diag = tf.squeeze(tf.matrix_diag_part(M1))
M2 = tf.stack([diag for i in range(bs)])
# l2_mat_1[i, j] = ||d_i - d_j||^2
l2_mat_1 = (M2 + tf.transpose(M2) - 2*M1)
loss_mat_1 = tf.matrix_band_part((l2_mat_1 - labels)**2, 0, -1)
loss_1 = tf.reduce_sum(loss_mat_1)

lis = sess.run([A1,A2,M1,diag,M2,l2_mat_1,loss_mat_1,loss_1])
print(lis)

grad = tf.gradients(loss_1, codes)
g = sess.run(grad)
print(g)
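The loss above builds l2_mat_1 from the Gram matrix via the identity ||d_i - d_j||^2 = ||d_i||^2 + ||d_j||^2 - 2 d_i . d_j; a numpy check against brute-force pairwise distances:

import numpy as np

D = np.random.randn(5, 3)
G = D @ D.T                        # Gram matrix, G[i, j] = d_i . d_j
sq = np.diag(G)                    # squared norms
l2 = sq[:, None] + sq[None, :] - 2 * G
brute = ((D[:, None, :] - D[None, :, :]) ** 2).sum(-1)
print(np.allclose(l2, brute))  # True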
    
Example #58
def gauss_kl(q_mu, q_sqrt, K=None):
    """
    Compute the KL divergence KL[q || p] between

          q(x) = N(q_mu, q_sqrt^2)
    and
          p(x) = N(0, K)

    We assume N multiple independent distributions, given by the columns of
    q_mu and the last dimension of q_sqrt. Returns the sum of the divergences.

    q_mu is a matrix (M x N), each column contains a mean.

    q_sqrt can be a 3D tensor (M x M x N), each matrix within is a lower
        triangular square-root matrix of the covariance of q.
    q_sqrt can be a matrix (M x N), each column represents the diagonal of a
        square-root matrix of the covariance of q.

    K is a positive definite matrix (M x M): the covariance of p.
    If K is None, compute the KL divergence to p(x) = N(0, I) instead.
    """

    if K is None:
        white = True
        alpha = q_mu
    else:
        white = False
        Lp = tf.cholesky(K)
        alpha = tf.matrix_triangular_solve(Lp, q_mu, lower=True)

    if q_sqrt.get_shape().ndims == 2:
        diag = True
        num_latent = tf.shape(q_sqrt)[1]
        NM = tf.size(q_sqrt)
        Lq = Lq_diag = q_sqrt
    elif q_sqrt.get_shape().ndims == 3:
        diag = False
        num_latent = tf.shape(q_sqrt)[2]
        NM = tf.reduce_prod(tf.shape(q_sqrt)[1:])
        Lq = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1,
                                 0)  # force lower triangle
        Lq_diag = tf.matrix_diag_part(Lq)
    else:  # pragma: no cover
        raise ValueError("Bad dimension for q_sqrt: {}".format(
            q_sqrt.get_shape().ndims))

    # Mahalanobis term: μqᵀ Σp⁻¹ μq
    mahalanobis = tf.reduce_sum(tf.square(alpha))

    # Constant term: - N x M
    constant = -tf.cast(NM, settings.tf_float)

    # Log-determinant of the covariance of q(x):
    logdet_qcov = tf.reduce_sum(tf.log(tf.square(Lq_diag)))

    # Trace term: tr(Σp⁻¹ Σq)
    if white:
        trace = tf.reduce_sum(tf.square(Lq))
    else:
        if diag:
            M = tf.shape(Lp)[0]
            Lp_inv = tf.matrix_triangular_solve(
                Lp, tf.eye(M, dtype=settings.tf_float), lower=True)
            K_inv = tf.matrix_triangular_solve(tf.transpose(Lp),
                                               Lp_inv,
                                               lower=False)
            trace = tf.reduce_sum(
                tf.expand_dims(tf.matrix_diag_part(K_inv), 1) *
                tf.square(q_sqrt))
        else:
            Lp_tiled = tf.tile(tf.expand_dims(Lp, 0), [num_latent, 1, 1])
            LpiLq = tf.matrix_triangular_solve(Lp_tiled, Lq, lower=True)
            trace = tf.reduce_sum(tf.square(LpiLq))

    twoKL = mahalanobis + constant - logdet_qcov + trace

    # Log-determinant of the covariance of p(x):
    if not white:
        log_sqdiag_Lp = tf.log(tf.square(tf.matrix_diag_part(Lp)))
        sum_log_sqdiag_Lp = tf.reduce_sum(log_sqdiag_Lp)
        prior_logdet = tf.cast(num_latent,
                               settings.tf_float) * sum_log_sqdiag_Lp
        twoKL += prior_logdet

    return 0.5 * twoKL
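As a cross-check for the dense (non-diagonal, non-white) branch above, here is a small numpy reference using the closed form KL[N(m, S) || N(0, K)] = 0.5 (tr(K^-1 S) + m^T K^-1 m - M + log|K| - log|S|); np_gauss_kl is a name introduced here, not part of any library. To compare against gauss_kl, pass q_mu = m[:, None] and q_sqrt = L_S[:, :, None].

import numpy as np

def np_gauss_kl(m, L_S, K):
    # KL[N(m, S) || N(0, K)] with S = L_S L_S^T
    M = m.shape[0]
    S = L_S @ L_S.T
    Kinv = np.linalg.inv(K)
    return 0.5 * (np.trace(Kinv @ S) + m @ Kinv @ m - M
                  + np.linalg.slogdet(K)[1] - np.linalg.slogdet(S)[1])

rng = np.random.RandomState(0)
A = rng.randn(3, 3)
K = A @ A.T + 3.0 * np.eye(3)                                   # covariance of p
L_S = np.tril(rng.randn(3, 3), -1) + np.diag(np.abs(rng.randn(3)) + 0.5)  # lower-triangular sqrt
m = rng.randn(3)
print(np_gauss_kl(m, L_S, K))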
Example #59
def get_mask(batch_size, sequence_length):
    # Causal mask: 0 on and below the diagonal, -1e9 above it.
    # Note: batch_size is unused; the mask broadcasts over the batch.
    lower_triangle = tf.matrix_band_part(tf.ones([sequence_length, sequence_length]), -1, 0)
    result = -1e9 * (1.0 - lower_triangle)
    print("get_mask==>result:", result)
    return result
Example #60
def conditional(Xnew, X, kern, f, full_cov=False, q_sqrt=None, whiten=False):
    """
    Given F, representing the GP at the points X, produce the mean and
    (co-)variance of the GP at the points Xnew.

    Additionally, there may be Gaussian uncertainty about F as represented by
    q_sqrt. In this case `f` represents the mean of the distribution and
    q_sqrt the square-root of the covariance.

    Additionally, the GP may have been centered (whitened) so that
        p(v) = N( 0, I)
        f = L v
    thus
        p(f) = N(0, LL^T) = N(0, K).
    In this case 'f' represents the values taken by v.

    The method can return either the diagonal of the covariance matrix for
    each output (full_cov=False) or the full covariance matrix (full_cov=True).

    We assume K independent GPs, represented by the columns of f (and the
    last dimension of q_sqrt).

     - Xnew is a data matrix, size N x D
     - X are data points, size M x D
     - kern is a GPflow kernel
     - f is a data matrix, M x K, representing the function values at X, for K functions.
     - q_sqrt (optional) is a matrix of standard-deviations or Cholesky
       matrices, size M x K or M x M x K
     - whiten (optional) is a boolean: whether to whiten the representation
       as described above.

    These functions are now considered deprecated, subsumed into this one:
        gp_predict
        gaussian_gp_predict
        gp_predict_whitened
        gaussian_gp_predict_whitened

    """

    # compute kernel stuff
    num_data = tf.shape(X)[0]  # M
    num_func = tf.shape(f)[1]  # K
    Kmn = kern.K(X, Xnew)
    Kmm = kern.K(X) + tf.eye(num_data, dtype=_settings.tf_float) * _settings.jitter_level
    Lm = tf.cholesky(Kmm)

    # Compute the projection matrix A
    A = tf.matrix_triangular_solve(Lm, Kmn, lower=True)

    # compute the covariance due to the conditioning
    if full_cov:
        fvar = kern.K(Xnew) - tf.matmul(A, A, transpose_a=True)
        shape = tf.stack([num_func, 1, 1])
    else:
        fvar = kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        shape = tf.stack([num_func, 1])
    fvar = tf.tile(tf.expand_dims(fvar, 0), shape)  # K x N x N or K x N

    # another backsubstitution in the unwhitened case
    if not whiten:
        A = tf.matrix_triangular_solve(tf.transpose(Lm), A, lower=False)

    # construct the conditional mean
    fmean = tf.matmul(A, f, transpose_a=True)

    if q_sqrt is not None:
        if q_sqrt.get_shape().ndims == 2:
            LTA = A * tf.expand_dims(tf.transpose(q_sqrt), 2)  # K x M x N
        elif q_sqrt.get_shape().ndims == 3:
            L = tf.matrix_band_part(tf.transpose(q_sqrt, (2, 0, 1)), -1, 0)  # K x M x M
            A_tiled = tf.tile(tf.expand_dims(A, 0), tf.stack([num_func, 1, 1]))
            LTA = tf.matmul(L, A_tiled, transpose_a=True)  # K x M x N
        else:  # pragma: no cover
            raise ValueError("Bad dimension for q_sqrt: %s" %
                             str(q_sqrt.get_shape().ndims))
        if full_cov:
            fvar = fvar + tf.matmul(LTA, LTA, transpose_a=True)  # K x N x N
        else:
            fvar = fvar + tf.reduce_sum(tf.square(LTA), 1)  # K x N
    fvar = tf.transpose(fvar)  # N x K or N x N x K

    return fmean, fvar
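A minimal smoke test for conditional, assuming the function above is in scope. It reads a module-level `_settings`, so this self-contained sketch stubs that out; _Settings and SimpleRBF are hypothetical stand-ins, the latter exposing only the K/Kdiag interface the function needs:

import numpy as np
import tensorflow as tf

class _Settings(object):  # stub for the module's `_settings`
    tf_float = tf.float64
    jitter_level = 1e-6
_settings = _Settings()

class SimpleRBF(object):  # hypothetical toy kernel, not part of any library
    def K(self, A, B=None):
        B = A if B is None else B
        sq = (tf.reduce_sum(tf.square(A), 1)[:, None]
              - 2.0 * tf.matmul(A, B, transpose_b=True)
              + tf.reduce_sum(tf.square(B), 1)[None, :])
        return tf.exp(-0.5 * sq)

    def Kdiag(self, A):
        return tf.ones([tf.shape(A)[0]], dtype=A.dtype)

rng = np.random.RandomState(0)
X = tf.constant(rng.randn(5, 1))     # M = 5 data points
Xnew = tf.constant(rng.randn(7, 1))  # N = 7 test points
f = tf.constant(rng.randn(5, 2))     # K = 2 functions

fmean, fvar = conditional(Xnew, X, SimpleRBF(), f)
with tf.Session() as sess:
    m, v = sess.run([fmean, fvar])
    print(m.shape, v.shape)  # (7, 2) (7, 2)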