Example #1
0
  def _overdetermined(op, grad):
    """Gradients for the overdetermined case of MatrixSolveLs.

    This is the backprop for the solution to the normal equations of the first
    kind:
       X = F(A, B) = (A^T * A + lambda * I)^{-1} * A^T * B
    which solve the least squares problem
       min ||A * X - B||_F^2 + lambda ||X||_F^2.
    """
    a = op.inputs[0]
    b = op.inputs[1]
    l2_regularizer = math_ops.cast(op.inputs[2], a.dtype.base_dtype)
    x = op.outputs[0]
    a_shape = array_ops.shape(a)
    batch_shape = a_shape[:-2]
    n = a_shape[-1]

    identity = linalg_ops.eye(n, batch_shape=batch_shape, dtype=a.dtype)
    gramian = math_ops.matmul(a, a, adjoint_a=True) + l2_regularizer * identity
    chol = linalg_ops.cholesky(gramian)
    # Temporary z = (A^T * A + lambda * I)^{-1} * grad.
    z = linalg_ops.cholesky_solve(chol, grad)
    xzt = math_ops.matmul(x, z, adjoint_b=True)
    zx_sym = xzt + array_ops.matrix_transpose(xzt)
    grad_a = -math_ops.matmul(a, zx_sym) + math_ops.matmul(b, z, adjoint_b=True)
    grad_b = math_ops.matmul(a, z)
    return (grad_a, grad_b, None)
Example #2
0
    def __call__(self, input_tuple, state, scope=None):
        with vs.variable_scope(scope or type(self).__name__):
            self._init_parameters()

            _input, target, dInput_dF = input_tuple
            
            u, s, r, inner_spikes, dW, dF, reward, reward_mean = state
            
            s = (1.0 - tau_syn) * s + tf.concat_v2([_input, inner_spikes], 1)
            u = (1.0 - tau_mem) * u + mo.matmul(s, self.W)
            r = (1.0 - tau_refr) * r
            

            act_raw = self._activation(u)
            act = act_raw * tf.exp(-r)
            
            spikes = tf.where(
                act > tf.random_uniform([batch_size, self._num_units]),
                tf.ones([batch_size, self._num_units]),
                tf.zeros([batch_size, self._num_units])
            )   
            
            hidden_spikes, _ = self.slice(spikes)
            _, act_visible = self.slice(act)
            
            
            reward_mean = (1.0 - tau_long) * reward_mean + tau_long * reward
            reward = (1.0 - tau_learn) * reward + tau_learn * tf.reduce_sum(
                target * safe_log(act_visible*dt) + (1.0 - target) * safe_log(1.0 - act_visible*dt)
            , 1)
            
            r += spikes * amp_refr

            act_grad = tf.gradients([act], [u])[0]
            
            learn_target = tf.concat_v2([hidden_spikes, target], 1)

            factor = tf.concat_v2([
                (reward - reward_mean) * tf.ones((batch_size, hidden_size,)), 
                tf.ones((batch_size, visible_size,))
            ], 1)
            
            neuron_derivative = tf.reduce_sum( (act_grad/act_raw) * (learn_target - act) * factor, 0)
            

            Wsliced = tf.slice(self.W, [0, 0], [self._filters_num, self._num_units])
            
            dF_deriv_part = tf.squeeze(mo.matmul(Wsliced, tf.expand_dims(neuron_derivative, 1)))
            
            
            dW += lrate * outer(
                tf.reduce_sum(s, 0),
                neuron_derivative
            )
            
            dInput_dF = tf.reduce_mean(dInput_dF, 0)
            
            dF += lrate * dF_deriv_part * dInput_dF
            
            return GLMOutputTuple(spikes, act, factor, reward, reward_mean), GLMStateTuple(u, s, r, spikes, dW, dF, reward, reward_mean)
Example #3
0
def power_sums_tensor(array_size, power_matrix, multiplier):
  r"""Computes \sum_{i=0}^{N-1} A^i B (A^i)^T for N=0..(array_size + 1).

  Args:
    array_size: The number of non-trivial sums to pre-compute.
    power_matrix: The "A" matrix above.
    multiplier: The "B" matrix above
  Returns:
    A Tensor with S[N] = \sum_{i=0}^{N-1} A^i B (A^i)^T
      S[0] is the zero matrix
      S[1] is B
      S[2] is A B A^T + B
      ...and so on
  """
  array_size = math_ops.cast(array_size, dtypes.int32)
  power_matrix = ops.convert_to_tensor(power_matrix)
  identity_like_power_matrix = linalg_ops.eye(
      array_ops.shape(power_matrix)[0], dtype=power_matrix.dtype)
  identity_like_power_matrix.set_shape(
      ops.convert_to_tensor(power_matrix).get_shape())
  transition_powers = functional_ops.scan(
      lambda previous_power, _: math_ops.matmul(previous_power, power_matrix),
      math_ops.range(array_size - 1),
      initializer=identity_like_power_matrix)
  summed = math_ops.cumsum(
      array_ops.concat([
          array_ops.expand_dims(multiplier, 0), math_ops.matmul(
              batch_times_matrix(transition_powers, multiplier),
              transition_powers,
              adjoint_b=True)
      ], 0))
  return array_ops.concat(
      [array_ops.expand_dims(array_ops.zeros_like(multiplier), 0), summed], 0)
Example #4
0
  def make_inverse_update_ops(self):
    """Create and return update ops corresponding to registered computations."""
    ops = []

    num_inverses = len(self._inverses_by_damping)
    matrix_power_registered = bool(self._matpower_by_exp_and_damping)
    use_eig = (
        self._eigendecomp or matrix_power_registered or
        num_inverses >= EIGENVALUE_DECOMPOSITION_THRESHOLD)

    if use_eig:
      self.register_eigendecomp()  # ensures self._eigendecomp is set
      eigenvalues, eigenvectors = self._eigendecomp  # pylint: disable=unpacking-non-sequence

      for damping, inv in self._inverses_by_damping.items():
        ops.append(
            inv.assign(
                math_ops.matmul(eigenvectors / (eigenvalues + damping),
                                array_ops.transpose(eigenvectors))))

      for (exp, damping), matpower in self._matpower_by_exp_and_damping.items():
        ops.append(
            matpower.assign(
                math_ops.matmul(eigenvectors *
                                (eigenvalues + damping)**exp,
                                array_ops.transpose(eigenvectors))))
      # These ops share computation and should be run on a single device.
      ops = [control_flow_ops.group(*ops)]
    else:
      for damping, inv in self._inverses_by_damping.items():
        ops.append(inv.assign(utils.posdef_inv(self._cov, damping)))

    return ops
  def testPotentialCycle(self):
    graph1 = ops.Graph()
    with graph1.as_default():
      a = constant_op.constant(1.0, shape=[2, 2])
      b = constant_op.constant(2.0, shape=[2, 2])
      matmul = math_ops.matmul(a, b)
      with ops.name_scope("hidden1"):
        c = nn_ops.relu(matmul)
        d = constant_op.constant(3.0, shape=[2, 2])
        matmul = math_ops.matmul(c, d)

    orig_meta_graph, _ = meta_graph.export_scoped_meta_graph(
        export_scope="hidden1", graph=graph1)

    graph2 = ops.Graph()
    with graph2.as_default():
      with self.assertRaisesRegexp(ValueError, "Graph contains unbound inputs"):
        meta_graph.import_scoped_meta_graph(
            orig_meta_graph, import_scope="new_hidden1")

      meta_graph.import_scoped_meta_graph(
          orig_meta_graph,
          import_scope="new_hidden1",
          input_map={
              "$unbound_inputs_MatMul": constant_op.constant(
                  4.0, shape=[2, 2])
          })
Example #6
0
  def make_inverse_update_ops(self):
    """Create and return update ops corresponding to registered computations."""
    ops = super(InverseProvidingFactor, self).make_inverse_update_ops()

    num_inverses = len(self._inverses_by_damping)
    matrix_power_registered = bool(self._matpower_by_exp_and_damping)
    use_eig = (self._eigendecomp or matrix_power_registered or
               num_inverses >= EIGENVALUE_DECOMPOSITION_THRESHOLD)

    if use_eig:
      self.register_eigendecomp()  # ensures self._eigendecomp is set
      eigenvalues, eigenvectors = self._eigendecomp  # pylint: disable=unpacking-non-sequence

      # The matrix self._cov is positive semidefinite by construction, but the
      # numerical eigenvalues could be negative due to numerical errors, so here
      # we clip them to be at least EIGENVALUE_CLIPPING_THRESHOLD.
      clipped_eigenvalues = math_ops.maximum(eigenvalues,
                                             EIGENVALUE_CLIPPING_THRESHOLD)

      for damping, inv in self._inverses_by_damping.items():
        ops.append(
            inv.assign(
                math_ops.matmul(eigenvectors / (clipped_eigenvalues + damping),
                                array_ops.transpose(eigenvectors))))

      for (exp, damping), matpower in self._matpower_by_exp_and_damping.items():
        ops.append(
            matpower.assign(
                math_ops.matmul(eigenvectors * (clipped_eigenvalues + damping)**
                                exp, array_ops.transpose(eigenvectors))))
    else:
      for damping, inv in self._inverses_by_damping.items():
        ops.append(inv.assign(utils.posdef_inv(self._cov, damping)))

    return ops
  def _testDefaultGraphInThread(self, constructed_event, continue_event, i):
    with session.Session() as s:
      self.assertEqual(ops.get_default_graph(), s.graph)
      a = constant_op.constant(1.0, shape=[1, 2])
      b = constant_op.constant(2.0, shape=[2, 3])
      c = math_ops.matmul(a, b)
      v = variables.Variable(c, name='var_%d' % i)

      # Block here until all threads have constructed their graph.
      constructed_event.set()
      continue_event.wait()

      assign_c_to_v = state_ops.assign(v, c)
      v.initializer.run()
      assign_c_to_v.eval()
      v_val = v.eval()
      self.assertAllEqual([[4.0, 4.0, 4.0]], v_val)
      d = constant_op.constant(3.0, shape=[2, 3])
      e = math_ops.matmul(a, d)
      assign_e_to_v = state_ops.assign(v, e)
      e_val = e.eval()
      self.assertAllEqual([[6.0, 6.0, 6.0]], e_val)
      v_val = v.eval()
      self.assertAllEqual([[4.0, 4.0, 4.0]], v_val)
      s.run(assign_e_to_v)
      v_val = v.eval()
      self.assertAllEqual([[6.0, 6.0, 6.0]], v_val)
      self.assertEqual(ops.get_default_graph(), s.graph)
Example #8
0
  def Test(self):
    if not use_static_shape_ or a_np_.dtype in (np.int32, np.int64, np.float16):
      self.skipTest("Skipping infeasible gradient test.")

    # Transpose and possibly conjugate a_np_ and b_np_ according to the
    # attributes such that tf.matmul(effective_a_np, effective_b_np, **kwargs)
    # results in a valid matrix multiplication and produces the same result as
    # np.matrix(a_np_) * np.matrix(b_np_)
    effective_a_np = _GetTransposedMatrices(a_np_, "a", kwargs_)
    effective_b_np = _GetTransposedMatrices(b_np_, "b", kwargs_)

    epsilon = np.finfo(a_np_.dtype).eps
    delta = epsilon**(1.0 / 3.0)
    tol = 20 * delta
    with self.session(), test_util.use_gpu():
      theoretical, numerical = gradient_checker_v2.compute_gradient(
          lambda x: math_ops.matmul(x, effective_b_np, **kwargs_),
          [effective_a_np],
          delta=delta)
      self.assertAllClose(theoretical, numerical, rtol=tol, atol=tol)

      theoretical, numerical = gradient_checker_v2.compute_gradient(
          lambda x: math_ops.matmul(effective_a_np, x, **kwargs_),
          [effective_b_np],
          delta=delta)
      self.assertAllClose(theoretical, numerical, rtol=tol, atol=tol)
Example #9
0
  def Test(self):
    np_val = np.matrix(a_np_) * np.matrix(b_np_)

    use_gpu = True
    if a_np_.dtype is np.float16 and (
        not test_util.CudaSupportsHalfMatMulAndConv()):
      use_gpu = False
      print("Built without fp16 matmul support for Cuda, running test on CPU.")

    # Transpose and possibly conjugate a_np_ and b_np_ according to the
    # attributes such that tf.matmul(effective_a_np, effective_b_np, **kwargs)
    # results in a valid matrix multiplication and produces the same result as
    # np.matrix(a_np_) * np.matrix(b_np_)
    effective_a_np = _GetTransposedMatrices(a_np_, "a", kwargs_)
    effective_b_np = _GetTransposedMatrices(b_np_, "b", kwargs_)
    with self.cached_session() as sess, test_util.device(use_gpu):
      if use_static_shape_:
        a = constant_op.constant(effective_a_np)
        b = constant_op.constant(effective_b_np)
        res = math_ops.matmul(a, b, **kwargs_)
        tf_val = self.evaluate(res)
      else:
        a = array_ops.placeholder(a_np_.dtype)
        b = array_ops.placeholder(b_np_.dtype)
        res = math_ops.matmul(a, b, **kwargs_)
        tf_val = sess.run(res, feed_dict={a: effective_a_np, b: effective_b_np})

    self.assertAllCloseAccordingToType(
        tf_val,
        np_val,
        float_rtol=2e-5,
        float_atol=2e-5,
        half_rtol=0.2,
        half_atol=0.2)
Example #10
0
  def sqrt_solve(self, x):
    """Computes `solve(self, x)`.

    Doesn't actually do the sqrt! Named as such to agree with API.

    To compute (M + V D V.T), we use the the Woodbury matrix identity:
      inv(M + V D V.T) = inv(M) - inv(M) V inv(C) V.T inv(M)
    where,
      C = inv(D) + V.T inv(M) V.
    See: https://en.wikipedia.org/wiki/Woodbury_matrix_identity

    Args:
      x: `Tensor`

    Returns:
      inv_of_self_times_x: `Tensor`
    """
    minv_x = linalg_ops.matrix_triangular_solve(self._m, x)
    vt_minv_x = math_ops.matmul(self._v, minv_x, transpose_a=True)
    cinv_vt_minv_x = linalg_ops.matrix_solve(
        self._woodbury_sandwiched_term(), vt_minv_x)
    v_cinv_vt_minv_x = math_ops.matmul(self._v, cinv_vt_minv_x)
    minv_v_cinv_vt_minv_x = linalg_ops.matrix_triangular_solve(
        self._m, v_cinv_vt_minv_x)
    return minv_x - minv_v_cinv_vt_minv_x
Example #11
0
  def testServerDefChanged(self):
    """Update server def, and run ops on new cluster."""
    context.set_server_def(
        server_def=get_server_def(
            ALT_JOB_NAME,
            local_server_port=0,
            remote_server_addresses=[
                self._cached_server1_target, self._cached_server2_target
            ],
            task_index=0))

    with ops.device("job:%s/replica:0/task:1/device:CPU:0" % ALT_JOB_NAME):
      x1 = array_ops.ones([2, 2])
    y = math_ops.matmul(x1, x1)
    np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())

    # Set the server def back to JOB_NAME
    context.set_server_def(
        server_def=get_server_def(
            JOB_NAME,
            local_server_port=0,
            remote_server_addresses=[
                self._cached_server1_target, self._cached_server2_target
            ],
            task_index=0))

    with ops.device("job:%s/replica:0/task:1/device:CPU:0" % JOB_NAME):
      x1 = array_ops.ones([2, 2])
    y = math_ops.matmul(x1, x1)
    np.testing.assert_array_equal([[2, 2], [2, 2]], y.numpy())
  def Test(self):
    np.random.seed(1)
    n = shape_[-1]
    batch_shape = shape_[:-2]
    np_dtype = dtype_.as_numpy_dtype
    a = np.random.uniform(
        low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
    if dtype_.is_complex:
      a += 1j * np.random.uniform(
          low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
    a += np.conj(a.T)
    a = np.tile(a, batch_shape + (1, 1))
    if dtype_ in (dtypes_lib.float32, dtypes_lib.complex64):
      atol = 1e-4
    else:
      atol = 1e-12
    np_e, np_v = np.linalg.eigh(a)
    with self.test_session():
      if compute_v_:
        tf_e, tf_v = linalg_ops.self_adjoint_eig(constant_op.constant(a))

        # Check that V*diag(E)*V^T is close to A.
        a_ev = math_ops.matmul(
            math_ops.matmul(tf_v, array_ops.matrix_diag(tf_e)),
            tf_v,
            adjoint_b=True)
        self.assertAllClose(a_ev.eval(), a, atol=atol)

        # Compare to numpy.linalg.eigh.
        CompareEigenDecompositions(self, np_e, np_v,
                                   tf_e.eval(), tf_v.eval(), atol)
      else:
        tf_e = linalg_ops.self_adjoint_eigvals(constant_op.constant(a))
        self.assertAllClose(
            np.sort(np_e, -1), np.sort(tf_e.eval(), -1), atol=atol)
def _symmetric_matrix_square_root(mat, eps=1e-10):
  """Compute square root of a symmetric matrix.

  Note that this is different from an elementwise square root. We want to
  compute M' where M' = sqrt(mat) such that M' * M' = mat.

  Also note that this method **only** works for symmetric matrices.

  Args:
    mat: Matrix to take the square root of.
    eps: Small epsilon such that any element less than eps will not be square
      rooted to guard against numerical instability.

  Returns:
    Matrix square root of mat.
  """
  # Unlike numpy, tensorflow's return order is (s, u, v)
  s, u, v = linalg_ops.svd(mat)
  # sqrt is unstable around 0, just use 0 in such case
  si = array_ops.where(math_ops.less(s, eps), s, math_ops.sqrt(s))
  # Note that the v returned by Tensorflow is v = V
  # (when referencing the equation A = U S V^T)
  # This is unlike Numpy which returns v = V^T
  return math_ops.matmul(
      math_ops.matmul(u, array_ops.diag(si)), v, transpose_b=True)
Example #14
0
 def grad(dmm, dr):
   return [
       math_ops.matmul(dmm, b, transpose_b=True) +
       math_ops.matmul(array_ops.ones_like(b * dr), b, transpose_b=True),
       math_ops.matmul(a, dmm, transpose_b=True) +
       math_ops.matmul(a, array_ops.ones_like(a) * dr, transpose_b=True)
   ]
Example #15
0
def _MatrixInverseGrad(op, grad):
  """Gradient for MatrixInverse."""
  ainv = op.outputs[0]
  return -math_ops.matmul(
      ainv,
      math_ops.matmul(grad, ainv, transpose_b=True),
      transpose_a=True)
Example #16
0
  def test_lanczos_bidiag(self):
    np.random.seed(1)
    a_np = np.random.uniform(
        low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_)
    tol = 1e-12 if dtype_ == np.float64 else 1e-5

    with self.cached_session() as sess:
      if use_static_shape_:
        a = constant_op.constant(a_np)
      else:
        a = array_ops.placeholder(dtype_)
      operator = util.create_operator(a)
      lbd = lanczos.lanczos_bidiag(
          operator, steps_, orthogonalize=orthogonalize_)

      # The computed factorization should satisfy the equations
      #  A * V = U * B
      #  A' * U[:, :-1] = V * B[:-1, :]'
      av = math_ops.matmul(a, lbd.v)
      ub = lanczos.bidiag_matmul(lbd.u, lbd.alpha, lbd.beta, adjoint_b=False)
      atu = math_ops.matmul(a, lbd.u[:, :-1], adjoint_a=True)
      vbt = lanczos.bidiag_matmul(lbd.v, lbd.alpha, lbd.beta, adjoint_b=True)

      if use_static_shape_:
        av_val, ub_val, atu_val, vbt_val = sess.run([av, ub, atu, vbt])
      else:
        av_val, ub_val, atu_val, vbt_val = sess.run([av, ub, atu, vbt],
                                                    feed_dict={a: a_np})
      self.assertAllClose(av_val, ub_val, atol=tol, rtol=tol)
      self.assertAllClose(atu_val, vbt_val, atol=tol, rtol=tol)
  def benchmarkBatchMatMulBroadcast(self):
    for (a_shape, b_shape) in self.shape_pairs:
      with compat.forward_compatibility_horizon(2019, 4, 26):
        with ops.Graph().as_default(), \
            session.Session(config=benchmark.benchmark_config()) as sess, \
            ops.device("/cpu:0"):
          matrix_a = variables.Variable(
              GetRandomNormalInput(a_shape, np.float32))
          matrix_b = variables.Variable(
              GetRandomNormalInput(b_shape, np.float32))
          variables.global_variables_initializer().run()

          # Use batch matmul op's internal broadcasting.
          self.run_op_benchmark(
              sess,
              math_ops.matmul(matrix_a, matrix_b),
              min_iters=50,
              name="batch_matmul_cpu_{}_{}".format(a_shape, b_shape))

          # Manually broadcast the input matrices using the broadcast_to op.
          broadcasted_batch_shape = array_ops.broadcast_static_shape(
              matrix_a.shape[:-2], matrix_b.shape[:-2])
          broadcasted_a_shape = broadcasted_batch_shape.concatenate(
              matrix_a.shape[-2:])
          broadcasted_b_shape = broadcasted_batch_shape.concatenate(
              matrix_b.shape[-2:])
          self.run_op_benchmark(
              sess,
              math_ops.matmul(
                  array_ops.broadcast_to(matrix_a, broadcasted_a_shape),
                  array_ops.broadcast_to(matrix_b, broadcasted_b_shape)),
              min_iters=50,
              name="batch_matmul_manual_broadcast_cpu_{}_{}".format(
                  a_shape, b_shape))
def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim):
  """Compute the approximate sliced Wasserstein distance.

  Args:
      a: (matrix) Distribution "a" of samples (row, col).
      b: (matrix) Distribution "b" of samples (row, col).
      random_sampling_count: (int) Number of random projections to average.
      random_projection_dim: (int) Dimension of the random projection space.
  Returns:
      Float containing the approximate distance between "a" and "b".
  """
  s = array_ops.shape(a)
  means = []
  for _ in range(random_sampling_count):
    # Random projection matrix.
    proj = random_ops.random_normal(
        [array_ops.shape(a)[1], random_projection_dim])
    proj *= math_ops.rsqrt(
        math_ops.reduce_sum(math_ops.square(proj), 0, keepdims=True))
    # Project both distributions and sort them.
    proj_a = math_ops.matmul(a, proj)
    proj_b = math_ops.matmul(b, proj)
    proj_a = _sort_rows(proj_a, s[0])
    proj_b = _sort_rows(proj_b, s[0])
    # Pairwise Wasserstein distance.
    wdist = math_ops.reduce_mean(math_ops.abs(proj_a - proj_b))
    means.append(wdist)
  return math_ops.reduce_mean(means)
  def _sqrt_solve(self, rhs):
    # Recall the square root of this operator is M + VDV^T.
    # The Woodbury formula gives:
    # (M + VDV^T)^{-1}
    # = M^{-1} - M^{-1} V (D^{-1} + V^T M^{-1} V)^{-1} V^T M^{-1}
    # = M^{-1} - M^{-1} V C^{-1} V^T M^{-1}
    # where C is the capacitance matrix.
    # TODO(jvdillon) Determine if recursively applying rank-1 updates is more
    # efficient.  May not be possible because a general n x n matrix can be
    # represeneted as n rank-1 updates, and solving with this matrix is always
    # done in O(n^3) time.
    m = self._operator
    v = self._v
    cchol = self._chol_capacitance(batch_mode=False)

    # The operators will use batch/singleton mode automatically.  We don't
    # override.
    # M^{-1} rhs
    minv_rhs = m.solve(rhs)
    # V^T M^{-1} rhs
    vt_minv_rhs = math_ops.matmul(v, minv_rhs, transpose_a=True)
    # C^{-1} V^T M^{-1} rhs
    cinv_vt_minv_rhs = linalg_ops.cholesky_solve(cchol, vt_minv_rhs)
    # V C^{-1} V^T M^{-1} rhs
    v_cinv_vt_minv_rhs = math_ops.matmul(v, cinv_vt_minv_rhs)
    # M^{-1} V C^{-1} V^T M^{-1} rhs
    minv_v_cinv_vt_minv_rhs = m.solve(v_cinv_vt_minv_rhs)

    # M^{-1} - M^{-1} V C^{-1} V^T M^{-1}
    return minv_rhs - minv_v_cinv_vt_minv_rhs
Example #20
0
  def transition_power_noise_accumulator(self, num_steps):
    """Computes power sums in closed form."""
    def _pack_and_reshape(*values):
      return array_ops.reshape(
          array_ops.stack(axis=1, values=values),
          array_ops.concat(values=[array_ops.shape(num_steps), [2, 2]], axis=0))

    num_steps = math_ops.cast(num_steps, self.dtype)
    noise_transitions = num_steps - 1
    noise_transform = ops.convert_to_tensor(self.get_noise_transform(),
                                            self.dtype)
    noise_covariance_transformed = math_ops.matmul(
        math_ops.matmul(noise_transform,
                        self.state_transition_noise_covariance),
        noise_transform,
        adjoint_b=True)
    # Un-packing the transformed noise as:
    # [[a b]
    #  [c d]]
    a, b, c, d = array_ops.unstack(
        array_ops.reshape(noise_covariance_transformed, [-1, 4]), axis=1)
    sum_of_first_n = noise_transitions * (noise_transitions + 1) / 2
    sum_of_first_n_squares = sum_of_first_n * (2 * noise_transitions + 1) / 3
    return _pack_and_reshape(
        num_steps * a + sum_of_first_n * (b + c) + sum_of_first_n_squares * d,
        num_steps * b + sum_of_first_n * d,
        num_steps * c + sum_of_first_n * d,
        num_steps * d)
Example #21
0
  def _SparseMatMul(t1, t2, transpose_a=False, transpose_b=False):
    """Helper function to create SparseMatMul op."""

    assert t1 in is_sparse and t2 in is_sparse
    t1_sparse = is_sparse[t1]
    t2_sparse = is_sparse[t2]
    if not t1_sparse and not t2_sparse:
      return math_ops.matmul(t1, t2,
                             transpose_a=transpose_a,
                             transpose_b=transpose_b)
    transpose_out = False
    if not t1_sparse:
      transpose_out = True
      t1, t2 = t2, t1
      t1_sparse, t2_sparse = t2_sparse, t1_sparse
      assert t1_sparse
      transpose_a, transpose_b = not transpose_b, not transpose_a

    if transpose_b:
      t2 = array_ops.transpose(t2)
      transpose_b = False
    m = math_ops.matmul(t1, t2,
                        transpose_a=transpose_a,
                        transpose_b=transpose_b,
                        a_is_sparse=t1_sparse,
                        b_is_sparse=t2_sparse)
    if transpose_out:
      m = array_ops.transpose(m)
    return m
Example #22
0
def _QrGrad(op, dq, dr):
  """Gradient for Qr."""
  q, r = op.outputs
  if q.dtype.is_complex:
    raise NotImplementedError("QrGrad not implemented for dtype: %s" % q.dtype)
  if (r.shape.ndims is None or r.shape.as_list()[-2] is None or
      r.shape.as_list()[-1] is None):
    raise NotImplementedError("QrGrad not implemented with dynamic shapes.")
  if r.shape.dims[-2].value != r.shape.dims[-1].value:
    raise NotImplementedError("QrGrad not implemented when ncols > nrows "
                              "or full_matrices is true and ncols != nrows.")

  qdq = math_ops.matmul(q, dq, adjoint_a=True)
  qdq_ = qdq - _linalg.adjoint(qdq)
  rdr = math_ops.matmul(r, dr, adjoint_b=True)
  rdr_ = rdr - _linalg.adjoint(rdr)
  tril = array_ops.matrix_band_part(qdq_ + rdr_, -1, 0)

  def _TriangularSolve(x, r):
    """Equiv to matmul(x, adjoint(matrix_inverse(r))) if r is upper-tri."""
    return _linalg.adjoint(
        linalg_ops.matrix_triangular_solve(
            r, _linalg.adjoint(x), lower=False, adjoint=False))

  grad_a = math_ops.matmul(q, dr + _TriangularSolve(tril, r))
  grad_b = _TriangularSolve(dq - math_ops.matmul(q, qdq), r)
  return grad_a + grad_b
Example #23
0
def compute_cov(tensor, tensor_right=None, normalizer=None):
  """Compute the empirical second moment of the rows of a 2D Tensor.

  This function is meant to be applied to random matrices for which the true row
  mean is zero, so that the true second moment equals the true covariance.

  Args:
    tensor: A 2D Tensor.
    tensor_right: An optional 2D Tensor. If provided, this function computes
      the matrix product tensor^T * tensor_right instead of tensor^T * tensor.
    normalizer: optional scalar for the estimator (by default, the normalizer is
        the number of rows of tensor).

  Returns:
    A square 2D Tensor with as many rows/cols as the number of input columns.
  """
  if normalizer is None:
    normalizer = array_ops.shape(tensor)[0]
  if tensor_right is None:
    cov = (
        math_ops.matmul(tensor, tensor, transpose_a=True) / math_ops.cast(
            normalizer, tensor.dtype))
    return (cov + array_ops.transpose(cov)) / math_ops.cast(2.0, cov.dtype)
  else:
    return (math_ops.matmul(tensor, tensor_right, transpose_a=True) /
            math_ops.cast(normalizer, tensor.dtype))
 def testMinimizeSparseResourceVariable(self):
   for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
     with self.test_session():
       var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
       var1 = resource_variable_ops.ResourceVariable([3.0], dtype=dtype)
       x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
       pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
       pred = math_ops.matmul(var0, x) + var1
       loss = pred * pred
       sgd_op = gradient_descent.GradientDescentOptimizer(1.0).minimize(loss)
       # TODO(apassos) calling initialize_resources on all resources here
       # doesn't work because the sessions and graph are reused across unit
       # tests and this would mean trying to reinitialize variables. Figure out
       # a long-term solution for this.
       resources.initialize_resources([var0, var1]).run()
       # Fetch params to validate initial values
       self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
       self.assertAllCloseAccordingToType([3.0], var1.eval())
       # Run 1 step of sgd
       sgd_op.run()
       # Validate updated params
       np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
       np_grad = 2 * np_pred
       self.assertAllCloseAccordingToType(
           [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
       self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
  def test_while_jacobian(self):
    x = random_ops.random_uniform([1, 3])
    y = random_ops.random_uniform([3, 3])

    # out = x @ y @ y @ y @ y, where @ is matmul operator.
    _, out = control_flow_ops.while_loop(
        lambda i, _: i < 4, lambda i, out: (i + 1, math_ops.matmul(out, y)),
        [0, x])

    def loop_fn(i):
      out_i = array_ops.gather(out, i, axis=1)
      return array_ops.reshape(gradient_ops.gradients(out_i, x)[0], [-1])

    out = pfor_control_flow_ops.pfor(loop_fn, iters=3)

    # The above code does not work with tf.while_loop instead of pfor. So we
    # manually compute the expected output here.
    # Note that gradient of output w.r.t is (y @ y @ y @ y)^T.
    expected_output = y
    for _ in range(3):
      expected_output = math_ops.matmul(expected_output, y)
    expected_output = array_ops.transpose(expected_output, [1, 0])

    with session.Session() as sess:
      out, expected = sess.run([out, expected_output])
      self.assertAllClose(expected, out)
    def runFiniteDifferences(self, shapes, dtypes=(dtypes_lib.float32, dtypes_lib.float64), scalarTest=False):
        with self.test_session(use_gpu=False):
            for shape in shapes:
                for batch in False, True:
                    for dtype in dtypes:
                        if not scalarTest:
                            x = constant_op.constant(np.random.randn(shape[0], shape[1]), dtype)
                            tensor = math_ops.matmul(x, array_ops.transpose(x)) / shape[0]
                        else:
                            # This is designed to be a faster test for larger matrices.
                            x = constant_op.constant(np.random.randn(), dtype)
                            R = constant_op.constant(np.random.randn(shape[0], shape[1]), dtype)
                            e = math_ops.mul(R, x)
                            tensor = math_ops.matmul(e, array_ops.transpose(e)) / shape[0]

                        # Inner-most matrices in tensor are positive definite.
                        if batch:
                            tensor = array_ops.tile(array_ops.expand_dims(tensor, 0), [4, 1, 1])
                        y = linalg_ops.cholesky(tensor)
                        if scalarTest:
                            y = math_ops.reduce_mean(y)
                        error = gradient_checker.compute_gradient_error(x, x._shape_as_list(), y, y._shape_as_list())
                        tf_logging.info("error = %f", error)
                        if dtype == dtypes_lib.float64:
                            self.assertLess(error, 1e-5)
                        else:
                            self.assertLess(error, 3e-3)
Example #27
0
 def test_mixing_eager_and_graph_tensors(self):
   with ops.Graph().as_default():
     x1 = array_ops.ones((3, 3))
   x2 = array_ops.ones((3, 3))
   self.assertIsInstance(x2, ops.EagerTensor)
   with self.assertRaisesRegexp(TypeError, 'Graph tensors'):
     math_ops.matmul(x1, x2)
Example #28
0
  def _project_input(self, inputs, c_prev, m_prev, with_c):
    """Fills in c_prev and m_prev with projected input, for input dimensions
    """
    conf = self._config

    if (inputs is not None and inputs.get_shape().with_rank(2)[1].value > 0
        and len(conf.inputs) > 0):
      if isinstance(inputs, tuple):
        if len(conf.inputs) != len(inputs):
          raise ValueError("Expect inputs as a tuple of {} "
                           "tensors".format(len(conf.inputs)))
        input_splits = inputs
      else:
        input_splits = array_ops.split(
          value=inputs, num_or_size_splits=len(conf.inputs), axis=1)
      input_sz = input_splits[0].get_shape().with_rank(2)[1].value

      for i, j in enumerate(conf.inputs):
        input_project_m = vs.get_variable(
          'project_m_{}'.format(j), [input_sz, conf.num_units],
          dtype=inputs.dtype)
        m_prev[j] = math_ops.matmul(input_splits[i], input_project_m)

        if with_c:
          input_project_c = vs.get_variable(
            'project_c_{}'.format(j), [input_sz, conf.num_units],
            dtype=inputs.dtype)
          c_prev[j] = math_ops.matmul(input_splits[i], input_project_c)
  def _batch_sqrt_solve(self, rhs):
    # Recall the square root of this operator is M + VDV^T.
    # The Woodbury formula gives:
    # (M + VDV^T)^{-1}
    # = M^{-1} - M^{-1} V (D^{-1} + V^T M^{-1} V)^{-1} V^T M^{-1}
    # = M^{-1} - M^{-1} V C^{-1} V^T M^{-1}
    # where C is the capacitance matrix.
    m = self._operator
    v = self._v
    cchol = self._chol_capacitance(batch_mode=True)

    # The operators will use batch/singleton mode automatically.  We don't
    # override.
    # M^{-1} rhs
    minv_rhs = m.solve(rhs)
    # V^T M^{-1} rhs
    vt_minv_rhs = math_ops.matmul(v, minv_rhs, adjoint_a=True)
    # C^{-1} V^T M^{-1} rhs
    cinv_vt_minv_rhs = linalg_ops.cholesky_solve(cchol, vt_minv_rhs)
    # V C^{-1} V^T M^{-1} rhs
    v_cinv_vt_minv_rhs = math_ops.matmul(v, cinv_vt_minv_rhs)
    # M^{-1} V C^{-1} V^T M^{-1} rhs
    minv_v_cinv_vt_minv_rhs = m.solve(v_cinv_vt_minv_rhs)

    # M^{-1} - M^{-1} V C^{-1} V^T M^{-1}
    return minv_rhs - minv_v_cinv_vt_minv_rhs
Example #30
0
  def _Underdetermined(op, grad):
    """Gradients for the underdetermined case of MatrixSolveLs.

    This is the backprop for the solution to the normal equations of the second
    kind:
      X = F(A, B) = A * (A*A^T + lambda*I)^{-1} * B
    that (for lambda=0) solve the least squares problem
      min ||X||_F subject to A*X = B.
    """
    a = op.inputs[0]
    b = op.inputs[1]
    l2_regularizer = math_ops.cast(op.inputs[2], a.dtype.base_dtype)
    # pylint: disable=protected-access
    chol = linalg_ops._RegularizedGramianCholesky(
        a, l2_regularizer=l2_regularizer, first_kind=False)
    # pylint: enable=protected-access
    grad_b = linalg_ops.cholesky_solve(chol, math_ops.matmul(a, grad))
    # Temporary tmp = (A * A^T + lambda * I)^{-1} * B.
    tmp = linalg_ops.cholesky_solve(chol, b)
    a1 = math_ops.matmul(tmp, a, adjoint_a=True)
    a1 = -math_ops.matmul(grad_b, a1)
    a2 = grad - math_ops.matmul(a, grad_b, adjoint_a=True)
    a2 = math_ops.matmul(tmp, a2, adjoint_b=True)
    grad_a = a1 + a2
    return (grad_a, grad_b, None)
Example #31
0
def _matmul_with_sharded_variable(tensor, sharded_tensor):
    """Multiply tensor with each tensor in sharded_tensor and column-concatenated"""
    return array_ops.concat(
        1, [math_ops.matmul(tensor, shard) for shard in sharded_tensor])
Example #32
0
 def jac_mul(tangent):
     flat_tangent = array_ops.reshape(tangent, shape=[-1])
     tangent_vector = array_ops.expand_dims(flat_tangent, 1)
     jvp_vector = math_ops.matmul(jac_fwd, tangent_vector)
     return array_ops.reshape(jvp_vector, tangent.shape)
Example #33
0
 def _verifyCholesky(self, x):
     # Verify that LL^T == x.
     with self.test_session() as sess:
         chol = linalg_ops.cholesky(x)
         verification = math_ops.matmul(chol, chol, adjoint_b=True)
         self._verifyCholeskyBase(sess, x, chol, verification)
def weighted_sum_from_feature_columns(columns_to_tensors,
                                      feature_columns,
                                      num_outputs,
                                      weight_collections=None,
                                      trainable=True,
                                      scope=None):
    """A tf.contrib.layers style linear prediction builder based on FeatureColumn.

  Generally a single example in training data is described with feature columns.
  This function generates weighted sum for each num_outputs. Weighted sum refers
  to logits in classification problems. It refers to prediction itself for
  linear regression problems.

  Example:

    ```
    # Building model for training
    feature_columns = (
        real_valued_column("my_feature1"),
        ...
    )
    columns_to_tensor = tf.parse_example(...)
    logits = weighted_sum_from_feature_columns(
        columns_to_tensors=columns_to_tensor,
        feature_columns=feature_columns,
        num_outputs=1)
    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                   logits=logits)
    ```

  Args:
    columns_to_tensors: A mapping from feature column to tensors. 'string' key
      means a base feature (not-transformed). It can have FeatureColumn as a
      key too. That means that FeatureColumn is already transformed by input
      pipeline. For example, `inflow` may have handled transformations.
    feature_columns: A set containing all the feature columns. All items in the
      set should be instances of classes derived from FeatureColumn.
    num_outputs: An integer specifying number of outputs. Default value is 1.
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.

  Returns:
    A tuple containing:

      * A Tensor which represents predictions of a linear model.
      * A dictionary which maps feature_column to corresponding Variable.
      * A Variable which is used for bias.

  Raises:
    ValueError: if FeatureColumn cannot be used for linear predictions.
  """
    columns_to_tensors = columns_to_tensors.copy()
    check_feature_columns(feature_columns)
    with variable_scope.variable_scope(
            scope,
            default_name='weighted_sum_from_feature_columns',
            values=columns_to_tensors.values()):
        output_tensors = []
        column_to_variable = dict()
        transformer = _Transformer(columns_to_tensors)
        # pylint: disable=protected-access
        for column in sorted(set(feature_columns), key=lambda x: x.key):
            transformed_tensor = transformer.transform(column)
            try:
                embedding_lookup_arguments = column._wide_embedding_lookup_arguments(
                    transformed_tensor)
                variable, predictions = _create_embedding_lookup(
                    column, columns_to_tensors, embedding_lookup_arguments,
                    num_outputs, trainable, weight_collections)
            except NotImplementedError:
                with variable_scope.variable_scope(
                        None,
                        default_name=column.name,
                        values=columns_to_tensors.values()):
                    tensor = column._to_dense_tensor(transformed_tensor)
                    tensor = _maybe_reshape_input_tensor(tensor,
                                                         column.name,
                                                         output_rank=2)
                    variable = [
                        contrib_variables.model_variable(
                            name='weight',
                            shape=[tensor.get_shape()[1], num_outputs],
                            initializer=init_ops.zeros_initializer(),
                            trainable=trainable,
                            collections=weight_collections)
                    ]
                    predictions = math_ops.matmul(tensor,
                                                  variable[0],
                                                  name='matmul')
            except ValueError as ee:
                raise ValueError(
                    'Error creating weighted sum for column: {}.\n'
                    '{}'.format(column.name, ee))
            output_tensors.append(
                array_ops.reshape(predictions, shape=(-1, num_outputs)))
            column_to_variable[column] = variable
            _log_variable(variable)
            _maybe_restore_from_checkpoint(column._checkpoint_path(), variable)
        # pylint: enable=protected-access
        predictions_no_bias = math_ops.add_n(output_tensors)
        bias = contrib_variables.model_variable(
            'bias_weight',
            shape=[num_outputs],
            initializer=init_ops.zeros_initializer(),
            trainable=trainable,
            collections=_add_variable_collection(weight_collections))
        _log_variable(bias)
        predictions = nn_ops.bias_add(predictions_no_bias, bias)

        return predictions, column_to_variable, bias
Example #35
0
def npairs_loss(labels,
                embeddings_anchor,
                embeddings_positive,
                reg_lambda=3e-3,
                print_losses=False):
    """Computes the npairs loss.
          Npairs loss expects paired data where a pair is composed of samples from the
          same labels and each pairs in the minibatch have different labels. The loss
          has two components. The first component is the L2 regularizer on the
          embedding vectors. The second component is the sum of cross entropy loss
          which takes each row of the pair-wise similarity matrix as logits and
          the remapped one-hot labels as labels.
          See:
          http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf
          Args:
            labels: 1-D tf.int32 `Tensor` of shape [batch_size/2].
            embeddings_anchor: 2-D Tensor of shape [batch_size/2, embedding_dim] for the
              embedding vectors for the anchor images. Embeddings should not be
              l2 normalized.
            embeddings_positive: 2-D Tensor of shape [batch_size/2, embedding_dim] for the
              embedding vectors for the positive images. Embeddings should not be
              l2 normalized.
            reg_lambda: Float. L2 regularization term on the embedding vectors.
            print_losses: Boolean. Option to print the xent and l2loss.
          Returns:
            npairs_loss: tf.float32 scalar.
      """
    # pylint: enable=line-too-long
    # Add the regularizer on the embedding.
    reg_anchor = math_ops.reduce_mean(
        math_ops.reduce_sum(math_ops.square(embeddings_anchor), 1))
    reg_positive = math_ops.reduce_mean(
        math_ops.reduce_sum(math_ops.square(embeddings_positive), 1))
    l2loss = math_ops.multiply(0.25 * reg_lambda,
                               reg_anchor + reg_positive,
                               name='l2loss')

    # Get per pair similarities.
    similarity_matrix = math_ops.matmul(embeddings_anchor,
                                        embeddings_positive,
                                        transpose_a=False,
                                        transpose_b=True)

    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    lshape = array_ops.shape(labels)
    assert lshape.shape == 1
    labels = array_ops.reshape(labels, [lshape[0], 1])

    labels_remapped = math_ops.to_float(
        math_ops.equal(labels, array_ops.transpose(labels)))
    labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)

    # Add the softmax loss.
    xent_loss = nn.softmax_cross_entropy_with_logits(logits=similarity_matrix,
                                                     labels=labels_remapped)
    xent_loss = math_ops.reduce_mean(xent_loss, name='xentropy')

    if print_losses:
        xent_loss = logging_ops.Print(
            xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

    return l2loss + xent_loss
Example #36
0
 def _covariance(self):
     x = self._variance_scale_term() * self._mean()
     return array_ops.matrix_set_diag(
         -math_ops.matmul(x[..., array_ops.newaxis],
                          x[..., array_ops.newaxis, :]),  # outer prod
         self._variance())
Example #37
0
 def lambda_fn(x):
     return math_ops.matmul(x[0], x[1])
def random_normal_correlated_columns(shape,
                                     mean=0.0,
                                     stddev=1.0,
                                     dtype=dtypes.float32,
                                     eps=1e-4,
                                     seed=None):
    """Batch matrix with (possibly complex) Gaussian entries and correlated cols.

  Returns random batch matrix `A` with specified element-wise `mean`, `stddev`,
  living close to an embedded hyperplane.

  Suppose `shape[-2:] = (M, N)`.

  If `M < N`, `A` is a random `M x N` [batch] matrix with iid Gaussian entries.

  If `M >= N`, then the colums of `A` will be made almost dependent as follows:

  ```
  L = random normal N x N-1 matrix, mean = 0, stddev = 1 / sqrt(N - 1)
  B = random normal M x N-1 matrix, mean = 0, stddev = stddev.

  G = (L B^H)^H, a random normal M x N matrix, living on N-1 dim hyperplane
  E = a random normal M x N matrix, mean = 0, stddev = eps
  mu = a constant M x N matrix, equal to the argument "mean"

  A = G + E + mu
  ```

  Args:
    shape:  Python list of integers.
      Shape of the returned tensor.  Must be at least length two.
    mean:  `Tensor` giving mean of normal to sample from.
    stddev:  `Tensor` giving stdev of normal to sample from.
    dtype:  `TensorFlow` `dtype` or numpy dtype
    eps:  Distance each column is perturbed from the low-dimensional subspace.
    seed:  Python integer seed for the RNG.

  Returns:
    `Tensor` with desired shape and dtype.

  Raises:
    ValueError:  If `shape` is not at least length 2.
  """
    dtype = dtypes.as_dtype(dtype)

    if len(shape) < 2:
        raise ValueError(
            "Argument shape must be at least length 2.  Found: %s" % shape)

    # Shape is the final shape, e.g. [..., M, N]
    shape = list(shape)
    batch_shape = shape[:-2]
    m, n = shape[-2:]

    # If there is only one column, "they" are by definition correlated.
    if n < 2 or n < m:
        return random_normal(shape,
                             mean=mean,
                             stddev=stddev,
                             dtype=dtype,
                             seed=seed)

    # Shape of the matrix with only n - 1 columns that we will embed in higher
    # dimensional space.
    smaller_shape = batch_shape + [m, n - 1]

    # Shape of the embedding matrix, mapping batch matrices
    # from [..., N-1, M] to [..., N, M]
    embedding_mat_shape = batch_shape + [n, n - 1]

    # This stddev for the embedding_mat ensures final result has correct stddev.
    stddev_mat = 1 / np.sqrt(n - 1)

    with ops.name_scope("random_normal_correlated_columns"):
        smaller_mat = random_normal(smaller_shape,
                                    mean=0.0,
                                    stddev=stddev_mat,
                                    dtype=dtype,
                                    seed=seed)

        if seed is not None:
            seed += 1287

        embedding_mat = random_normal(embedding_mat_shape,
                                      dtype=dtype,
                                      seed=seed)

        embedded_t = math_ops.matmul(embedding_mat,
                                     smaller_mat,
                                     transpose_b=True)
        embedded = array_ops.matrix_transpose(embedded_t)

        mean_mat = array_ops.ones_like(embedded) * mean

        return embedded + random_normal(shape, stddev=eps,
                                        dtype=dtype) + mean_mat
 def _matmul(self, x, adjoint=False, adjoint_arg=False):
     return math_ops.matmul(self._matrix,
                            x,
                            adjoint_a=adjoint,
                            adjoint_b=adjoint_arg)
Example #40
0
    def call(self, inputs, state):
        """Perform a step of attention-wrapped RNN.

    - Step 1: Mix the `inputs` and previous step's `attention` output via
      `cell_input_fn`.
    - Step 2: Call the wrapped `cell` with this input and its previous state.
    - Step 3: Score the cell's output with `attention_mechanism`.
    - Step 4: Calculate the alignments by passing the score through the
      `normalizer`.
    - Step 5: Calculate the context vector as the inner product between the
      alignments and the attention_mechanism's values (memory).
    - Step 6: Calculate the attention output by concatenating the cell output
      and context through the attention layer (a linear layer with
      `attention_size` outputs).

    Args:
      inputs: (Possibly nested tuple of) Tensor, the input at this time step.
      state: An instance of `AttentionWrapperState` containing
        tensors from the previous time step.

    Returns:
      A tuple `(attention_or_cell_output, next_state)`, where:

      - `attention_or_cell_output` depending on `output_attention`.
      - `next_state` is an instance of `DynamicAttentionWrapperState`
         containing the state calculated at this time step.
    """
        # Step 1: Calculate the true inputs to the cell based on the
        # previous attention value.

        output_prev_step = state.cell_state.h  # get hr_(i-1)

        attention_input = self._attention_input_fn(
            inputs,
            output_prev_step)  # get input to BahdanauAttention to get alpha_i
        alignments, raw_scores = self._attention_mechanism(
            attention_input, previous_alignments=state.alignments
        )  #alignments after softmax raw_scores(batch,sentence)

        expanded_alignments = array_ops.expand_dims(alignments, 1)

        attention_mechanism_values = self._attention_mechanism.values  #.value is encoder after multply w. means  wh
        context = math_ops.matmul(
            expanded_alignments,
            attention_mechanism_values)  #means how import per word in senctce
        context = array_ops.squeeze(context, [1])

        cell_inputs = self._cell_input_fn(
            inputs, context
        )  #concatenate input with alpha*memory and feed into root LSTM
        cell_state = state.cell_state
        cell_output, next_cell_state = self._cell(cell_inputs, cell_state)

        if self._attention_layer is not None:
            attention = self._attention_layer(
                array_ops.concat([cell_output, context], 1))
        else:
            attention = context

        if self._alignment_history:
            alignment_history = state.alignment_history.write(
                state.time, alignments)
        else:
            alignment_history = ()

        next_state = AttentionWrapperState(time=state.time + 1,
                                           cell_state=next_cell_state,
                                           attention=attention,
                                           alignments=alignments,
                                           alignment_history=alignment_history)

        if self._output_attention:
            return raw_scores, next_state
        else:
            return cell_output, next_state
Example #41
0
  def __call__(self, inputs, state, scope=None):
    """Run one step of GridRNN.
    Args:
      inputs: input Tensor, 2D, batch x input_size. Or None
      state: state Tensor, 2D, batch x state_size. Note that state_size = cell_state_size * recurrent_dims
      scope: VariableScope for the created subgraph; defaults to "GridRNNCell".
    Returns:
      A tuple containing:
      - A 2D, batch x output_size, Tensor representing the output of the cell
        after reading "inputs" when previous state was "state".
      - A 2D, batch x state_size, Tensor representing the new state of the cell
        after reading "inputs" when previous state was "state".
    """
    state_sz = state.get_shape().as_list()[1]
    if self.state_size != state_sz:
      raise ValueError('Actual state size not same as specified: {} vs {}.'.format(state_sz, self.state_size))

    conf = self._config
    dtype = inputs.dtype if inputs is not None else state.dtype

    # c_prev is `m`, and m_prev is `h` in the paper. Keep c and m here for consistency with the codebase
    c_prev = [None] * self._config.num_dims
    m_prev = [None] * self._config.num_dims
    cell_output_size = self._cell.state_size - conf.num_units

    # for LSTM   : state = memory cell + output, hence cell_output_size > 0
    # for GRU/RNN: state = output (whose size is equal to _num_units), hence cell_output_size = 0
    for recurrent_dim, start_idx in zip(self._config.recurrents, range(0, self.state_size, self._cell.state_size)):
      if cell_output_size > 0:
        c_prev[recurrent_dim] = array_ops.slice(state, [0, start_idx], [-1, conf.num_units])
        m_prev[recurrent_dim] = array_ops.slice(state, [0, start_idx + conf.num_units], [-1, cell_output_size])
      else:
        m_prev[recurrent_dim] = array_ops.slice(state, [0, start_idx], [-1, conf.num_units])

    new_output = [None] * conf.num_dims
    new_state = [None] * conf.num_dims

    with vs.variable_scope(scope or type(self).__name__):  # GridRNNCell

      # project input
      if inputs is not None and sum(inputs.get_shape().as_list()) > 0 and len(conf.inputs) > 0:
        input_splits = array_ops.split(1, len(conf.inputs), inputs)
        input_sz = input_splits[0].get_shape().as_list()[1]

        for i, j in enumerate(conf.inputs):
          input_project_m = vs.get_variable('project_m_{}'.format(j), [input_sz, conf.num_units], dtype=dtype)
          m_prev[j] = math_ops.matmul(input_splits[i], input_project_m)

          if cell_output_size > 0:
            input_project_c = vs.get_variable('project_c_{}'.format(j), [input_sz, conf.num_units], dtype=dtype)
            c_prev[j] = math_ops.matmul(input_splits[i], input_project_c)


      _propagate(conf.non_priority, conf, self._cell, c_prev, m_prev, new_output, new_state, True)
      _propagate(conf.priority, conf, self._cell, c_prev, m_prev, new_output, new_state, False)

      output_tensors = [new_output[i] for i in self._config.outputs]
      output = array_ops.zeros([0, 0], dtype) if len(output_tensors) == 0 else array_ops.concat(1,
                                                                                                       output_tensors)

      state_tensors = [new_state[i] for i in self._config.recurrents]
      states = array_ops.zeros([0, 0], dtype) if len(state_tensors) == 0 else array_ops.concat(1, state_tensors)

    return output, states
Example #42
0
 def test_mixing_numpy_arrays_and_graph_tensors(self):
     with ops.Graph().as_default():
         x1 = array_ops.ones((3, 3))
     x2 = np.ones((3, 3), dtype='float32')
     with self.assertRaisesRegexp(TypeError, 'Graph tensors'):
         math_ops.matmul(x1, x2)
Example #43
0
def linear(args,
           output_size,
           bias,
           handle=None,
           bias_initializer=None,
           kernel_initializer=None,
           kernel_name=_WEIGHTS_VARIABLE_NAME,
           bias_name=_BIAS_VARIABLE_NAME):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

    This function is originally copied from rnn_cell_impl.py and add
    the capability to deal with 3-D matrix.
    Args:
        args: a 2D/3D Tensor or a list of 2D/3D, batch x n, Tensors.
        output_size: int, second dimension of W[i].
        bias: boolean, whether to add a bias term or not.
        handle: A Tensor. If provided, use it
          as the weight matrix.
        bias_initializer: starting value to initialize the bias
          (default is all zeros).
        kernel_initializer: starting value to initialize the weight.

    Returns: A 2D/3D Tensor with shape [batch x output_size] equal to
        sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

    Raises:
      ValueError: if some of the arguments has unspecified or wrong shape.
    """
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):  # added by Chengqi
        ## capable for 3D tensor
        shape = args.get_shape()
        if shape.ndims > 2:
            scope = vs.get_variable_scope()
            with vs.variable_scope(scope) as outer_scope:
                if handle is None:
                    weights = vs.get_variable(kernel_name,
                                              [shape[-1].value, output_size],
                                              dtype=args.dtype,
                                              initializer=kernel_initializer)
                else:
                    assert output_size == handle.get_shape().as_list()[-1], \
                        "ouput_size should be the same as the last dimension of handle tensor"
                    weights = handle

                res = math_ops.tensordot(args, weights,
                                         [[shape.ndims - 1], [0]])

                if not bias:
                    return res
                with vs.variable_scope(outer_scope) as inner_scope:
                    inner_scope.set_partitioner(None)
                    if bias_initializer is None:
                        bias_initializer = init_ops.constant_initializer(
                            0.0, dtype=args.dtype)
                    biases = vs.get_variable(bias_name, [output_size],
                                             dtype=args.dtype,
                                             initializer=bias_initializer)
                return nn_ops.bias_add(res, biases)
        else:
            args = [args]

    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    shapes = [a.get_shape() for a in args]
    for shape in shapes:
        if shape.ndims != 2:
            raise ValueError("linear is expecting 2D arguments: %s" % shapes)
        if shape[1].value is None:
            raise ValueError(
                "linear expects shape[1] to be provided for shape %s, "
                "but saw %s" % (shape, shape[1]))
        else:
            total_arg_size += shape[1].value

    dtype = [a.dtype for a in args][0]

    # Now the computation.
    scope = vs.get_variable_scope()
    with vs.variable_scope(scope) as outer_scope:
        if handle is None:
            weights = vs.get_variable(kernel_name,
                                      [total_arg_size, output_size],
                                      dtype=dtype,
                                      initializer=kernel_initializer)
        else:
            assert output_size == handle.get_shape().as_list()[-1], \
                "ouput_size should be the same as the last dimension of handle tensor"
            weights = handle
        if len(args) == 1:
            res = math_ops.matmul(args[0], weights)
        else:
            res = math_ops.matmul(array_ops.concat(args, 1), weights)
        if not bias:
            return res
        with vs.variable_scope(outer_scope) as inner_scope:
            inner_scope.set_partitioner(None)
            if bias_initializer is None:
                bias_initializer = init_ops.constant_initializer(0.0,
                                                                 dtype=dtype)
            biases = vs.get_variable(bias_name, [output_size],
                                     dtype=dtype,
                                     initializer=bias_initializer)
        return nn_ops.bias_add(res, biases)
Example #44
0
 def _benchmark_tf_matmul(self, m, transpose_b, num_iters):
     func = lambda: math_ops.matmul(m, m, transpose_b=transpose_b)
     self._run(func, num_iters)
Example #45
0
 def logsumexp_logit(embeddings):
     return math_ops.reduce_logsumexp(math_ops.matmul(embeddings,
                                                      reweighted_inputs,
                                                      transpose_b=True),
                                      axis=1,
                                      keep_dims=False)
Example #46
0
    def __call__(self, inputs, state, scope=None):
        """Run one step of LSTM.

        Args:
          inputs: input Tensor, 2D, batch x num_units.
          state: if `state_is_tuple` is False, this must be a state Tensor,
            `2-D, batch x state_size`.  If `state_is_tuple` is True, this must be a
            tuple of state Tensors, both `2-D`, with column sizes `c_state` and
            `m_state`.
          scope: VariableScope for the created subgraph; defaults to "LSTMCell".

        Returns:
          A tuple containing:
          - A `2-D, [batch x output_dim]`, Tensor representing the output of the
            LSTM after reading `inputs` when previous state was `state`.
            Here output_dim is:
               num_proj if num_proj was set,
               num_units otherwise.
          - Tensor(s) representing the new state of LSTM after reading `inputs` when
            the previous state was `state`.  Same type and shape(s) as `state`.

        Raises:
          ValueError: If input size cannot be inferred from inputs via
            static shape inference.
        """
        num_proj = self._num_units if self._num_proj is None else self._num_proj

        if self._state_is_tuple:
            (c_prev, m_prev) = state
        else:
            c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
            m_prev = array_ops.slice(state, [0, self._num_units],
                                     [-1, num_proj])

        dtype = inputs.dtype
        input_size = inputs.get_shape().with_rank(2)[1]
        if input_size.value is None:
            raise ValueError(
                "Could not infer input size from inputs.get_shape()[-1]")
        with vs.variable_scope(scope or type(self).__name__,
                               initializer=self._initializer):  # "LSTMCell"
            concat_w = _get_concat_variable(
                "W", [input_size.value + num_proj, 4 * self._num_units], dtype,
                self._num_unit_shards)

            b = vs.get_variable("B",
                                shape=[4 * self._num_units],
                                initializer=array_ops.zeros_initializer,
                                dtype=dtype)

            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            cell_inputs = array_ops.concat(1, [inputs, m_prev])
            lstm_matrix = nn_ops.bias_add(
                math_ops.matmul(cell_inputs, concat_w), b)
            i, j, f, o = array_ops.split(1, 4, lstm_matrix)

            # Diagonal connections
            if self._use_peepholes:
                w_f_diag = vs.get_variable("W_F_diag",
                                           shape=[self._num_units],
                                           dtype=dtype)
                w_i_diag = vs.get_variable("W_I_diag",
                                           shape=[self._num_units],
                                           dtype=dtype)
                w_o_diag = vs.get_variable("W_O_diag",
                                           shape=[self._num_units],
                                           dtype=dtype)

            if self._use_peepholes:
                c = (sigmoid(f + self._forget_bias + w_f_diag * c_prev) *
                     c_prev +
                     sigmoid(i + w_i_diag * c_prev) * self._activation(j))
            else:
                c = (sigmoid(f + self._forget_bias) * c_prev +
                     sigmoid(i) * self._activation(j))

            if self._cell_clip is not None:
                # pylint: disable=invalid-unary-operand-type
                c = clip_ops.clip_by_value(c, -self._cell_clip,
                                           self._cell_clip)
                # pylint: enable=invalid-unary-operand-type

            if self._use_peepholes:
                m = sigmoid(o + w_o_diag * c) * self._activation(c)
            else:
                m = sigmoid(o) * self._activation(c)

            if self._num_proj is not None:
                concat_w_proj = _get_concat_variable(
                    "W_P", [self._num_units, self._num_proj], dtype,
                    self._num_proj_shards)

                m = math_ops.matmul(m, concat_w_proj)
                if self._proj_clip is not None:
                    # pylint: disable=invalid-unary-operand-type
                    m = clip_ops.clip_by_value(m, -self._proj_clip,
                                               self._proj_clip)
                    # pylint: enable=invalid-unary-operand-type

        new_state = (LSTMStateTuple(c, m)
                     if self._state_is_tuple else array_ops.concat(1, [c, m]))
        return m, new_state
 def overflow_in_float16():
   out = var * 2 ** 10
   out = math_ops.matmul(out, out)
   return array_ops.reshape(out, ())
Example #48
0
def linear(args,
           output_size,
           bias,
           bias_start=0.0,
           name='',
           scope=None,
           var_on_cpu=False,
           wd=0.0,
           squeeze=False,
           initializer=None,
           feat=None,
           state=None,
           drop_rate=1.0):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

    Args:
      args: a 2D Tensor or a list of 2D, batch x n, Tensors.
      output_size: int, second dimension of W[i].
      bias: boolean, whether to add a bias term or not.
      bias_start: starting value to initialize the bias; 0 by default.
      scope: VariableScope for the created subgraph; defaults to "Linear".

    Returns:
      A 2D Tensor with shape [batch x output_size] equal to
      sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

    Raises:
      ValueError: if some of the arguments has unspecified or wrong shape.
    """
    if args is None or (isinstance(args, (list, tuple)) and not args):
        raise ValueError("`args` must be specified")
    if not isinstance(args, (list, tuple)):
        args = [args]

    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    shapes = [a.get_shape().as_list() for a in args]
    assert len(set(tuple(shape[:-1]) for shape in shapes)) <= 1

    new_shapes = [flatten(shape, 2) for shape in shapes]
    res_shape = shapes[0][:-1] + [output_size]
    args = [
        tf.reshape(arg, new_shape) for arg, new_shape in zip(args, new_shapes)
    ]

    for new_shape in new_shapes:
        if len(new_shape) != 2:
            raise ValueError("Linear is expecting 2D arguments: %s" %
                             str(shapes))
        if not new_shape[1]:
            raise ValueError("Linear expects shape[1] of arguments: %s" %
                             str(shapes))
        else:
            total_arg_size += new_shape[1]

    # This is for Dialog Match
    if feat is not None: total_arg_size -= 2
    if state is not None: total_arg_size += state.get_shape().as_list()[-1]

    # Now the computation.
    with vs.variable_scope(scope or "Linear"):
        if var_on_cpu:
            with tf.device("/cpu:0"):
                matrix = vs.get_variable("Matrix" + name,
                                         [total_arg_size, output_size],
                                         initializer=initializer)
        else:
            matrix = vs.get_variable("Matrix" + name,
                                     [total_arg_size, output_size],
                                     initializer=initializer)

        if wd:
            weight_decay = tf.mul(tf.nn.l2_loss(matrix),
                                  wd,
                                  name='weight_loss' + name)
            tf.add_to_collection('losses', weight_decay)

# Modify for Feature Matching
        if feat is not None:
            W = vs.get_variable("Embed" + name,
                                [total_arg_size + 2, total_arg_size + 2],
                                initializer=initializer)
            args_ = tf.transpose(math_ops.matmul(args[0], W))  # [D * N]
            h1 = tf.transpose(tf.slice(args_, [0, 0], [total_arg_size, -1]))
            h2 = tf.slice(args_, [total_arg_size, 0], [2, -1])
            f = tf.cast(tf.transpose(feat, [1, 0, 2]),
                        'float32')  # [2 * N * A]

            res = math_ops.matmul(h1, matrix)  # [N * A]
            for i in range(2):
                h2_ = tf.gather(h2, i)  # [1 * N]
                f_ = tf.gather(f, i)  # [1 * N * A]
                res += tf.transpose(tf.mul(h2_, tf.transpose(f_)))
        elif state is not None:
            h = tf.concat(1, [args[0], tf.cast(state, 'float32')])  # [D+A]
            res = math_ops.matmul(h, matrix)  # [N * A]
        else:
            if len(args) == 1:
                res = math_ops.matmul(args[0], matrix)
            else:
                res = math_ops.matmul(array_ops.concat(1, args), matrix)

        if not bias:
            res = tf.reshape(res, res_shape, name='out' + name)
            if squeeze:
                res = tf.squeeze(res, squeeze_dims=[len(res_shape) - 1])
            return res
        bias_term = vs.get_variable(
            "Bias" + name, [output_size],
            initializer=init_ops.constant_initializer(bias_start))
        res = res + bias_term
        res = tf.reshape(res, res_shape, name='out' + name)
        if squeeze:
            res = tf.squeeze(res, squeeze_dims=[len(res_shape) - 1])
    return res
Example #49
0
 def GraphFn(self, inp, inp1):
     x1 = math_ops.matmul(inp, inp1, name="matmul")
     # Relu to reach minimum segment size.
     x1 = nn.relu(x1, name="relu")
     return array_ops.identity(x1, name="output_0")
Example #50
0
    def _testScopedExport(self, test_dir, exported_filenames):
        graph = ops.Graph()
        with graph.as_default():
            # Creates an inference graph.
            # Hidden 1
            colocate_constraint = constant_op.constant(1.2, name="constraint")
            images = constant_op.constant(1.2,
                                          dtypes.float32,
                                          shape=[100, 28],
                                          name="images")
            with ops.name_scope("hidden1"):
                with graph.colocate_with(colocate_constraint.op):
                    weights1 = variables.Variable(random_ops.truncated_normal(
                        [28, 128], stddev=1.0 / math.sqrt(float(28))),
                                                  name="weights")
                # The use of control_flow_ops.cond here is purely for adding test
                # coverage the save and restore of control flow context (which doesn't
                # make any sense here from a machine learning perspective).  The typical
                # biases is a simple Variable without the conditions.
                biases1 = variables.Variable(control_flow_ops.cond(
                    math_ops.less(random.random(),
                                  0.5), lambda: array_ops.ones([128]),
                    lambda: array_ops.zeros([128])),
                                             name="biases")
                hidden1 = nn_ops.relu(
                    math_ops.matmul(images, weights1) + biases1)

            # Hidden 2
            with ops.name_scope("hidden2"):
                weights2 = variables.Variable(random_ops.truncated_normal(
                    [128, 32], stddev=1.0 / math.sqrt(float(128))),
                                              name="weights")

                # The use of control_flow_ops.while_loop here is purely for adding test
                # coverage the save and restore of control flow context (which doesn't
                # make any sense here from a machine learning perspective).  The typical
                # biases is a simple Variable without the conditions.
                def loop_cond(it, _):
                    return it < 2

                def loop_body(it, biases2):
                    biases2 += constant_op.constant(0.1, shape=[32])
                    return it + 1, biases2

                _, biases2 = control_flow_ops.while_loop(
                    loop_cond, loop_body, [
                        constant_op.constant(0),
                        variables.Variable(array_ops.zeros([32]),
                                           name="biases")
                    ])
                hidden2 = nn_ops.relu(
                    math_ops.matmul(hidden1, weights2) + biases2)
            # Linear
            with ops.name_scope("softmax_linear"):
                weights3 = variables.Variable(random_ops.truncated_normal(
                    [32, 10], stddev=1.0 / math.sqrt(float(32))),
                                              name="weights")
                biases3 = variables.Variable(array_ops.zeros([10]),
                                             name="biases")
                logits = math_ops.matmul(hidden2, weights3) + biases3
                ops.add_to_collection("logits", logits)

            # Exports each sub-graph.
            # Exports the first one with unbound_inputs_col_name set to default.
            orig_meta_graph1, var_list = meta_graph.export_scoped_meta_graph(
                filename=os.path.join(test_dir, exported_filenames[0]),
                graph=ops.get_default_graph(),
                export_scope="hidden1")
            self.assertEqual(["biases:0", "weights:0"],
                             sorted(var_list.keys()))
            var_names = [v.name for _, v in var_list.items()]
            self.assertEqual(["hidden1/biases:0", "hidden1/weights:0"],
                             sorted(var_names))

            # Exports the rest with no unbound_inputs_col_name.
            orig_meta_graph2, _ = meta_graph.export_scoped_meta_graph(
                filename=os.path.join(test_dir, exported_filenames[1]),
                graph=ops.get_default_graph(),
                export_scope="hidden2",
                unbound_inputs_col_name=None)
            orig_meta_graph3, _ = meta_graph.export_scoped_meta_graph(
                filename=os.path.join(test_dir, exported_filenames[2]),
                graph=ops.get_default_graph(),
                export_scope="softmax_linear",
                unbound_inputs_col_name=None)

        return [orig_meta_graph1, orig_meta_graph2, orig_meta_graph3]
Example #51
0
def linear(args,
           output_size,
           bias,
           bias_initializer=None,
           kernel_initializer=None,
           kernel_regularizer=None,
           bias_regularizer=None,
           normalize=False):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.
  Args:
    args: a 2D Tensor or a list of 2D, batch x n, Tensors.
    output_size: int, second dimension of W[i].
    bias: boolean, whether to add a bias term or not.
    bias_initializer: starting value to initialize the bias
      (default is all zeros).
    kernel_initializer: starting value to initialize the weight.
    kernel_regularizer: kernel regularizer
    bias_regularizer: bias regularizer
  Returns:
    A 2D Tensor with shape [batch x output_size] equal to
    sum_i(args[i] * W[i]), where W[i]s are newly created matrices.
  Raises:
    ValueError: if some of the arguments has unspecified or wrong shape.
  """
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    shapes = [a.get_shape() for a in args]
    for shape in shapes:
        if shape.ndims != 2:
            raise ValueError("linear is expecting 2D arguments: %s" % shapes)
        if shape[1].value is None:
            raise ValueError(
                "linear expects shape[1] to be provided for shape %s, "
                "but saw %s" % (shape, shape[1]))
        else:
            total_arg_size += shape[1].value

    dtype = [a.dtype for a in args][0]

    # Now the computation.
    scope = vs.get_variable_scope()
    with vs.variable_scope(scope) as outer_scope:
        weights = vs.get_variable(_WEIGHTS_VARIABLE_NAME,
                                  [total_arg_size, output_size],
                                  dtype=dtype,
                                  initializer=kernel_initializer,
                                  regularizer=kernel_regularizer)

        if len(args) == 1:
            res = math_ops.matmul(args[0], weights)
        else:
            res = math_ops.matmul(array_ops.concat(args, 1), weights)

        res = tf.cond(normalize, lambda: tf.contrib.layers.layer_norm(res),
                      lambda: res)

        # remove the layer's bias if there is one (because it would be redundant)
        if not bias:
            return res

        with vs.variable_scope(outer_scope) as inner_scope:
            inner_scope.set_partitioner(None)
            if bias_initializer is None:
                bias_initializer = init_ops.constant_initializer(0.0,
                                                                 dtype=dtype)
            biases = vs.get_variable(_BIAS_VARIABLE_NAME, [output_size],
                                     dtype=dtype,
                                     initializer=bias_initializer,
                                     regularizer=bias_regularizer)

    return nn_ops.bias_add(res, biases)
Example #52
0
    def call(self, inputs, state):
        """Run one step of LSTM.
        Args:
          inputs: input Tensor, 2D, `[batch, num_units].
          state: if `state_is_tuple` is False, this must be a state Tensor,
            `2-D, [batch, state_size]`.  If `state_is_tuple` is True, this must be a
            tuple of state Tensors, both `2-D`, with column sizes `c_state` and
            `m_state`.
        Returns:
          A tuple containing:
          - A `2-D, [batch, output_dim]`, Tensor representing the output of the
            LSTM after reading `inputs` when previous state was `state`.
            Here output_dim is:
               num_proj if num_proj was set,
               num_units otherwise.
          - Tensor(s) representing the new state of LSTM after reading `inputs` when
            the previous state was `state`.  Same type and shape(s) as `state`.
        Raises:
          ValueError: If input size cannot be inferred from inputs via
            static shape inference.
        """
        num_proj = self._num_units if self._num_proj is None else self._num_proj
        sigmoid = math_ops.sigmoid

        if self._state_is_tuple:
            (c_prev, h_prev) = state
        else:
            c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
            h_prev = array_ops.slice(state, [0, self._num_units],
                                     [-1, num_proj])

        input_size = inputs.get_shape().with_rank(2)[1]
        if input_size.value is None:
            raise ValueError(
                "Could not infer input size from inputs.get_shape()[-1]")

        # i = input_gate, j = new_input, f = forget_gate, o = output_gate
        # calculate softmaxed output
        y_prev = softmax(math_ops.matmul(self._y_w, h_prev) + self._y_b)

        lstm_matrix = math_ops.matmul(
            array_ops.concat([inputs, h_prev, y_prev], 1), self._kernel)
        lstm_matrix = nn_ops.bias_add(lstm_matrix, self._bias)

        i, j, f, o = array_ops.split(value=lstm_matrix,
                                     num_or_size_splits=4,
                                     axis=1)
        # Diagonal connections
        if self._use_peepholes:
            c = (sigmoid(f + self._forget_bias + self._w_f_diag * c_prev) *
                 c_prev +
                 sigmoid(i + self._w_i_diag * c_prev) * self._activation(j))
        else:
            c = (sigmoid(f + self._forget_bias) * c_prev +
                 sigmoid(i) * self._activation(j))

        if self._cell_clip is not None:
            # pylint: disable=invalid-unary-operand-type
            c = clip_ops.clip_by_value(c, -self._cell_clip, self._cell_clip)
            # pylint: enable=invalid-unary-operand-type
        if self._use_peepholes:
            h = sigmoid(o + self._w_o_diag * c) * self._activation(c)
        else:
            h = sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            h = math_ops.matmul(h, self._proj_kernel)

            if self._proj_clip is not None:
                # pylint: disable=invalid-unary-operand-type
                h = clip_ops.clip_by_value(h, -self._proj_clip,
                                           self._proj_clip)
                # pylint: enable=invalid-unary-operand-type

        new_state = (LSTMStateTuple(c, h)
                     if self._state_is_tuple else array_ops.concat([c, h], 1))
        return h, new_state
    def __call__(self, input_x, input_t, previous_sate):
        """
        :param input_x: tf.placeholder with shape [batch_size, input_x_depth], dtype=tf.float64
        :param input_t: tf.placeholder a matrix with shape [batch_size, input_t_depth], dtype=tf.float64
        :param previous_sate: the previous hidden states is a tensor with size [batch_size, hidden_state],
        dtype=tf.float64. if the previous state is the zero state, the size should be [batch_size,], dtype=tf.float64.
        :return: new hidden state with size [batch_size, hidden_state], dtype=tf.float64.
        """
        if (previous_sate.shape.dims == 1 and previous_sate.shape[0].value != self.__hidden_state) or \
                (previous_sate.shape.dims == 2 and previous_sate.shape[1].value != self.__hidden_state):
            raise ValueError('previous state/ zero state size incompatible')

        # get predefined variable
        td = input_t.shape[1].value
        hs = self.__hidden_state
        xd = input_x.shape[1].value
        gw = self.__initial_strategy_map['gate_weight']
        gb = self.__initial_strategy_map['gate_bias']
        cw = self.__initial_strategy_map['candidate_weight']
        cb = self.__initial_strategy_map['candidate_bias']

        # define the parameter a GRU cell
        with tf.variable_scope("cell_para", reuse=tf.AUTO_REUSE):
            gate_weight = tf.get_variable(name='gate_weight',
                                          shape=[td, hs * 2],
                                          initializer=gw,
                                          dtype=tf.float64)
            gate_bias = tf.get_variable(name='gate_bias',
                                        shape=[hs * 2],
                                        initializer=gb,
                                        dtype=tf.float64)
            candidate_weight = tf.get_variable(name='candidate_weight',
                                               shape=[xd + hs, hs],
                                               initializer=cw,
                                               dtype=tf.float64)
            candidate_bias = tf.get_variable(name='candidate_bias',
                                             shape=[hs],
                                             initializer=cb,
                                             dtype=tf.float64)

        with tf.name_scope('gate_calc'):
            gate_value = math_ops.matmul(input_t, gate_weight)
            gate_value = nn_ops.bias_add(gate_value, gate_bias)
            gate_value = math_ops.sigmoid(gate_value)

        with tf.name_scope('gate_split'):
            r, u = array_ops.split(value=gate_value,
                                   num_or_size_splits=2,
                                   axis=1)
            r_state = r * previous_sate

        with tf.name_scope('candidate_calc'):
            concat = array_ops.concat([input_x, r_state], axis=1)
            candidate = math_ops.matmul(concat,
                                        candidate_weight,
                                        name='candidate_matmul')
            candidate = nn_ops.bias_add(candidate,
                                        candidate_bias,
                                        name='candidate_bias_add')
            c = self.__activation(candidate)

        with tf.name_scope('new_state'):
            new_h = u * previous_sate + (1 - u) * c

        return new_h
Example #54
0
 def f():
     x = constant_op.constant([[1, 2], [3, 4]])
     out = math_ops.matmul(v, x)
     self.assertEqual(out.get_shape(),
                      tensor_shape.TensorShape([2, 2]))
  def _grad(op, grad):
    """A gradient function for RFFT with the provided `rank` and `irfft_fn`."""
    fft_length = op.inputs[1]
    input_shape = _array_ops.shape(op.inputs[0])
    is_even = _math_ops.cast(1 - (fft_length[-1] % 2), _dtypes.complex64)

    def _tile_for_broadcasting(matrix, t):
      expanded = _array_ops.reshape(
          matrix,
          _array_ops.concat([
              _array_ops.ones([_array_ops.rank(t) - 2], _dtypes.int32),
              _array_ops.shape(matrix)
          ], 0))
      return _array_ops.tile(
          expanded, _array_ops.concat([_array_ops.shape(t)[:-2], [1, 1]], 0))

    def _mask_matrix(length):
      """Computes t_n = exp(sqrt(-1) * pi * n^2 / line_len)."""
      # TODO(rjryan): Speed up computation of twiddle factors using the
      # following recurrence relation and cache them across invocations of RFFT.
      #
      # t_n = exp(sqrt(-1) * pi * n^2 / line_len)
      # for n = 0, 1,..., line_len-1.
      # For n > 2, use t_n = t_{n-1}^2 / t_{n-2} * t_1^2
      a = _array_ops.tile(
          _array_ops.expand_dims(_math_ops.range(length), 0), (length, 1))
      b = _array_ops.transpose(a, [1, 0])
      return _math_ops.exp(
          -2j * np.pi * _math_ops.cast(a * b, _dtypes.complex64) /
          _math_ops.cast(length, _dtypes.complex64))

    def _ymask(length):
      """A sequence of [1+0j, -1+0j, 1+0j, -1+0j, ...] with length `length`."""
      return _math_ops.cast(1 - 2 * (_math_ops.range(length) % 2),
                            _dtypes.complex64)

    y0 = grad[..., 0:1]
    if rank == 1:
      ym = grad[..., -1:]
      extra_terms = y0 + is_even * ym * _ymask(input_shape[-1])
    elif rank == 2:
      # Create a mask matrix for y0 and ym.
      base_mask = _mask_matrix(input_shape[-2])

      # Tile base_mask to match y0 in shape so that we can batch-matmul the
      # inner 2 dimensions.
      tiled_mask = _tile_for_broadcasting(base_mask, y0)

      y0_term = _math_ops.matmul(tiled_mask, _math_ops.conj(y0))
      extra_terms = y0_term

      ym = grad[..., -1:]
      ym_term = _math_ops.matmul(tiled_mask, _math_ops.conj(ym))

      inner_dim = input_shape[-1]
      ym_term = _array_ops.tile(
          ym_term,
          _array_ops.concat([
              _array_ops.ones([_array_ops.rank(grad) - 1], _dtypes.int32),
              [inner_dim]
          ], 0)) * _ymask(inner_dim)

      extra_terms += is_even * ym_term

    # The gradient of RFFT is the IRFFT of the incoming gradient times a scaling
    # factor, plus some additional terms to make up for the components dropped
    # due to Hermitian symmetry.
    input_size = _math_ops.cast(
        _fft_size_for_grad(op.inputs[0], rank), _dtypes.float32)
    the_irfft = irfft_fn(grad, fft_length)
    return 0.5 * (the_irfft * input_size + _math_ops.real(extra_terms)), None
Example #56
0
def _compute_sampled_logits(weights,
                            biases,
                            inputs,
                            labels,
                            num_sampled,
                            num_classes,
                            num_true=1,
                            sampled_values=None,
                            subtract_log_q=True,
                            remove_accidental_hits=False,
                            partition_strategy="mod",
                            name=None):
    """Helper function for nce_loss and sampled_softmax_loss functions.

  Computes sampled output training logits and labels suitable for implementing
  e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
  sampled_softmax_loss).

  Note: In the case where num_true > 1, we assign to each target class
  the target probability 1 / num_true so that the target probabilities
  sum to 1 per-example.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        `[num_classes, dim]`.  The (possibly-partitioned) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    subtract_log_q: A `bool`.  whether to subtract the log expected count of
        the labels in the sample to get the logits of the true labels.
        Default is True.  Turn off for Negative Sampling.
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).
  Returns:
    out_logits, out_labels: `Tensor` objects each with shape
        `[batch_size, num_true + num_sampled]`, for passing to either
        `nn.sigmoid_cross_entropy_with_logits` (NCE) or
        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
  """

    if not isinstance(weights, list):
        weights = [weights]

    with ops.op_scope(weights + [biases, inputs, labels], name,
                      "compute_sampled_logits"):
        if labels.dtype != dtypes.int64:
            labels = math_ops.cast(labels, dtypes.int64)
        labels_flat = array_ops.reshape(labels, [-1])

        # Sample the negative labels.
        #   sampled shape: [num_sampled] tensor
        #   true_expected_count shape = [batch_size, 1] tensor
        #   sampled_expected_count shape = [num_sampled] tensor
        if sampled_values is None:
            sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
                true_classes=labels,
                num_true=num_true,
                num_sampled=num_sampled,
                unique=True,
                range_max=num_classes)
        # NOTE: pylint cannot tell that 'sampled_values' is a sequence
        # pylint: disable=unpacking-non-sequence
        sampled, true_expected_count, sampled_expected_count = sampled_values
        # pylint: enable=unpacking-non-sequence

        # labels_flat is a [batch_size * num_true] tensor
        # sampled is a [num_sampled] int tensor
        all_ids = array_ops.concat(0, [labels_flat, sampled])

        # weights shape is [num_classes, dim]
        all_w = embedding_ops.embedding_lookup(
            weights, all_ids, partition_strategy=partition_strategy)
        all_b = embedding_ops.embedding_lookup(biases, all_ids)
        # true_w shape is [batch_size * num_true, dim]
        # true_b is a [batch_size * num_true] tensor
        true_w = array_ops.slice(
            all_w, [0, 0],
            array_ops.pack([array_ops.shape(labels_flat)[0], -1]))
        true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))

        # inputs shape is [batch_size, dim]
        # true_w shape is [batch_size * num_true, dim]
        # row_wise_dots is [batch_size, num_true, dim]
        dim = array_ops.shape(true_w)[1:2]
        new_true_w_shape = array_ops.concat(0, [[-1, num_true], dim])
        row_wise_dots = math_ops.mul(
            array_ops.expand_dims(inputs, 1),
            array_ops.reshape(true_w, new_true_w_shape))
        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        dots_as_matrix = array_ops.reshape(row_wise_dots,
                                           array_ops.concat(0, [[-1], dim]))
        true_logits = array_ops.reshape(_sum_rows(dots_as_matrix),
                                        [-1, num_true])
        true_b = array_ops.reshape(true_b, [-1, num_true])
        true_logits += true_b

        # Lookup weights and biases for sampled labels.
        #   sampled_w shape is [num_sampled, dim]
        #   sampled_b is a [num_sampled] float tensor
        sampled_w = array_ops.slice(
            all_w, array_ops.pack([array_ops.shape(labels_flat)[0], 0]),
            [-1, -1])
        sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])

        # inputs has shape [batch_size, dim]
        # sampled_w has shape [num_sampled, dim]
        # sampled_b has shape [num_sampled]
        # Apply X*W'+B, which yields [batch_size, num_sampled]
        sampled_logits = math_ops.matmul(inputs, sampled_w,
                                         transpose_b=True) + sampled_b

        if remove_accidental_hits:
            acc_hits = candidate_sampling_ops.compute_accidental_hits(
                labels, sampled, num_true=num_true)
            acc_indices, acc_ids, acc_weights = acc_hits

            # This is how SparseToDense expects the indices.
            acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
            acc_ids_2d_int32 = array_ops.reshape(
                math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
            sparse_indices = array_ops.concat(
                1, [acc_indices_2d, acc_ids_2d_int32], "sparse_indices")
            # Create sampled_logits_shape = [batch_size, num_sampled]
            sampled_logits_shape = array_ops.concat(0, [
                array_ops.shape(labels)[:1],
                array_ops.expand_dims(num_sampled, 0)
            ])
            if sampled_logits.dtype != acc_weights.dtype:
                acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
            sampled_logits += sparse_ops.sparse_to_dense(
                sparse_indices,
                sampled_logits_shape,
                acc_weights,
                default_value=0.0,
                validate_indices=False)

        if subtract_log_q:
            # Subtract log of Q(l), prior probability that l appears in sampled.
            true_logits -= math_ops.log(true_expected_count)
            sampled_logits -= math_ops.log(sampled_expected_count)

        # Construct output logits and labels. The true labels/logits start at col 0.
        out_logits = array_ops.concat(1, [true_logits, sampled_logits])
        # true_logits is a float tensor, ones_like(true_logits) is a float tensor
        # of ones. We then divide by num_true to ensure the per-example labels sum
        # to 1.0, i.e. form a proper probability distribution.
        out_labels = array_ops.concat(1, [
            array_ops.ones_like(true_logits) / num_true,
            array_ops.zeros_like(sampled_logits)
        ])

    return out_logits, out_labels
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    shutil.rmtree(FLAGS.saved_model_path)

    # Create the graph
    x1 = array_ops.placeholder(dtypes.int32, shape=(1, 3), name='input1')
    # 'y' is a read-only Reference Variable in this test case, which will be
    # converted to Resource Variable in the MLIR lowering pass.
    y = variable_scope.get_variable(name='y', initializer=[[1], [2], [3]])

    r1 = math_ops.matmul(x1, y, name='result1')

    x2 = array_ops.placeholder(dtypes.int32, shape=(1, 3), name='input2')
    r21 = math_ops.matmul(x2, y, name='result21')
    t2 = math_ops.add(x2, 1)
    r22 = math_ops.matmul(t2, y, name='result22')

    x3 = array_ops.placeholder(dtypes.int32, shape=(1, 3), name='input3')
    r31 = math_ops.matmul(x3, y, name='result31')
    r32 = math_ops.add(x3, r31, name='result32')
    r33 = math_ops.add(x3, r32, name='result33')

    sess = session.Session()

    sess.run(variables.global_variables_initializer())

    sm_builder = builder.SavedModelBuilder(FLAGS.saved_model_path)
    tensor_info_x1 = utils.build_tensor_info(x1)
    tensor_info_x2 = utils.build_tensor_info(x2)
    tensor_info_x3 = utils.build_tensor_info(x3)
    tensor_info_r1 = utils.build_tensor_info(r1)
    tensor_info_r21 = utils.build_tensor_info(r21)
    tensor_info_r22 = utils.build_tensor_info(r22)
    tensor_info_r31 = utils.build_tensor_info(r31)
    tensor_info_r32 = utils.build_tensor_info(r32)
    tensor_info_r33 = utils.build_tensor_info(r33)

    toy_signature = (signature_def_utils.build_signature_def(
        inputs={'x1': tensor_info_x1},
        outputs={'r1': tensor_info_r1},
        method_name=signature_constants.PREDICT_METHOD_NAME))
    another_toy_signature = (signature_def_utils.build_signature_def(
        inputs={'x2': tensor_info_x2},
        outputs={
            'r21': tensor_info_r21,
            'r22': tensor_info_r22,
        },
        method_name=signature_constants.PREDICT_METHOD_NAME))
    yet_another_toy_signature = (signature_def_utils.build_signature_def(
        inputs={'x3': tensor_info_x3},
        outputs={
            'r31': tensor_info_r31,
            'r32': tensor_info_r32,
            'r33': tensor_info_r33,
        },
        method_name=signature_constants.PREDICT_METHOD_NAME))

    sm_builder.add_meta_graph_and_variables(
        sess, [tag_constants.SERVING],
        signature_def_map={
            'toy': toy_signature,
            'another_toy': another_toy_signature,
            'yet_another_toy': yet_another_toy_signature,
            signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
            toy_signature,
        },
        strip_default_attrs=True)
    sm_builder.save()
Example #58
0
 def loss():
     pred = math_ops.matmul(
         embedding_ops.embedding_lookup([var0], [0]), x)  # pylint: disable=cell-var-from-loop
     return pred * pred
Example #59
0
 def _expected(mat, tangent):
     return (math_ops.matmul(tangent, mat, transpose_b=True) +
             math_ops.matmul(mat, tangent, transpose_b=True))
Example #60
0
def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
  """Helper for einsum() that computes the result of a two-argument einsum().

  Args:
    t0: a `Tensor`
    t0_axis_labels: a string of axis labels.  This string's length must equal
      the rank of t0.
    t1: a `Tensor`
    t1_axis_labels: a string to axis labels.  This string's length must equal
      the rank of t1.
    axes_to_sum: set of labels of axes to be summed over

  Returns:
    A `Tensor` whose elements are obtained by summing, over all axes in
    `axes_to_sum`, the corresponding elements of `t0` and `t1`.

    For example, if t0_axis_labels == 'abijk', t1_axis_labels == 'acjkl', and
    axes_to_sum == {j,k}, this will return a tensor x where

      out[a,b,c,i,l] = sum_j sum_k t0[a,b,i,j,k] * t1[a,c,j,k,l]

  Raises:
    ValueError: if the rank of `t0` does not match the length of
      `t0_axis_labels`, or that of `t1` does not match the length of
      `t1_axis_labels`.
  """
  if len(t0_axis_labels) != len(t0.get_shape()):
    raise ValueError(
        'Tensor t0 of rank %d does not match einsum reduction of length %d' %
        (len(t0.get_shape()), len(t0_axis_labels)))
  if len(t1_axis_labels) != len(t1.get_shape()):
    raise ValueError(
        'Tensor t1 of rank %d does not match einsum reduction of length %d' %
        (len(t1.get_shape()), len(t1_axis_labels)))

  # This function computes the result of a two-argument einsum() using batch
  # matrix multiplication.  This involves
  # 1. transposing t0 and t1 so that axes are in the correct order for
  #    batch matrix multiplication, and
  # 2. reshaping t0 and t1 so that they are both of rank 3.

  # First, we divide axes into three groups:
  #  * "preserved" axes are present in both inputs and the output
  #  * "summed" axes are present in both inputs but not the output
  #  * "broadcast" axes are present in exactly one input and the output
  #
  # As an example, if the einsum is abijk,acjkl->abcil, then "a" is a
  # preserved axis, "b" and "c" are broadcast axes, and "j" and "k" are
  # summed axes.
  assert all(a in t0_axis_labels and a in t1_axis_labels for a in axes_to_sum)
  preserved_axes = (set(t0_axis_labels) & set(t1_axis_labels)) - axes_to_sum
  broadcast_axes = {}
  for i, sym_list in enumerate([t0_axis_labels, t1_axis_labels]):
    broadcast_axes[i] = set(sym_list) - preserved_axes - axes_to_sum

  # Reorder the axes so that:
  # 1. preserved axes come first in both inputs
  # 2. in input 0, broadcast axes come next, followed by summed axes
  # 3. in input 1, summed axes come next, followed by broadcast axes
  def sort_key(input_index, a):
    if a in preserved_axes:
      return (-1, a)
    elif ((input_index == 0 and a in broadcast_axes[0]) or
          (input_index == 1 and a in axes_to_sum)):
      return (0, a)
    else:
      return (1, a)

  axis_labels = [t0_axis_labels, t1_axis_labels]
  sorted_axes = [
      sorted(sym_list, key=lambda a: sort_key(i, a))
      for i, sym_list in enumerate(axis_labels)
  ]
  inputs = [t0, t1]
  for i, axes_str in enumerate(axis_labels):
    perm = [axes_str.find(a) for a in sorted_axes[i]]
    inputs[i] = _transpose_if_necessary(inputs[i], perm)
  t0, t1 = inputs

  if not axes_to_sum:
    # In the special case where there are no axes to sum over, reduce to mul()
    # rather than to batch matrix multiplication.
    for _ in broadcast_axes[1]:
      t0 = array_ops.expand_dims(t0, -1)
    for _ in broadcast_axes[0]:
      t1 = array_ops.expand_dims(t1, len(preserved_axes))
    product = math_ops.multiply(t0, t1)
    product_axes = sorted_axes[0] + sorted_axes[1][len(preserved_axes):]
    return product, ''.join(product_axes)
  else:
    # Reduce to matmul().

    # Reshape both inputs so as to combine multiple broadcast axes
    # into a single axis, and combine multiple summed axes into a
    # single axis.

    t0_shape = _get_shape(t0)
    num_broadcast_elements_t0 = _total_size(
        t0_shape[len(preserved_axes):-len(axes_to_sum)])
    num_summed_elements = _total_size(t0_shape[-len(axes_to_sum):])
    new_shape = (
        t0_shape[:len(preserved_axes)] +
        [num_broadcast_elements_t0, num_summed_elements])
    t0 = _reshape_if_necessary(t0, new_shape)

    t1_shape = _get_shape(t1)
    num_broadcast_elements_t1 = _total_size(
        t1_shape[len(preserved_axes) + len(axes_to_sum):])
    new_shape = (
        t1_shape[:len(preserved_axes)] +
        [num_summed_elements, num_broadcast_elements_t1])
    t1 = _reshape_if_necessary(t1, new_shape)

    product = math_ops.matmul(t0, t1)

    # Undo compaction of broadcast axes
    uncompacted_shape = (
        t0_shape[:len(preserved_axes) + len(broadcast_axes[0])] +
        t1_shape[len(t1_shape) - len(broadcast_axes[1]):])
    product = _reshape_if_necessary(product, uncompacted_shape)

    product_axes = (
        sorted_axes[0][:len(preserved_axes) + len(broadcast_axes[0])] +
        sorted_axes[1][len(sorted_axes[1]) - len(broadcast_axes[1]):])

    return product, ''.join(product_axes)