def _stride_size(node, name_to_node):
  """Computes stride size given a TF node.

  Args:
    node: Tensorflow node (NodeDef proto).
    name_to_node: Map from node name to NodeDef, used to look up the strides
      constant when the node is a MaxPoolV2 op.

  Returns:
    stride_x: Stride size for horizontal direction (integer).
    stride_y: Stride size for vertical direction (integer).
  """
  if node.op == "MaxPoolV2":
    strides_input_name = node.input[2]
    if not strides_input_name.endswith("/strides"):
      raise ValueError("Strides name does not end with '/strides'")
    strides_node = name_to_node[strides_input_name]
    value = strides_node.attr["value"]
    t = make_ndarray(value.tensor)
    stride_y = t[1]
    stride_x = t[2]
  else:
    strides_attr = node.attr["strides"]
    logging.vlog(4, "strides_attr = %s", strides_attr)
    stride_y = strides_attr.list.i[1]
    stride_x = strides_attr.list.i[2]
  return stride_x, stride_y
def _conv_kernel_size(node, name_to_order_node):
  """Computes kernel size given a TF convolution or pooling node.

  Args:
    node: Tensorflow node (NodeDef proto).
    name_to_order_node: Map from name to {order, node}. Output of
      graph_compute_order.get_compute_order().

  Returns:
    kernel_size_x: Kernel size for horizontal direction (integer).
    kernel_size_y: Kernel size for vertical direction (integer).

  Raises:
    ValueError: If the weight layer node is invalid.
  """
  weights_layer_read_name = node.input[1]
  if not weights_layer_read_name.endswith("/read"):
    raise ValueError(
        "Weight layer's name input to conv layer does not end with '/read'")
  weights_layer_param_name = weights_layer_read_name[:-5]
  weights_node = name_to_order_node[weights_layer_param_name].node
  if weights_node.op != "VariableV2":
    raise ValueError("Weight layer is not of type VariableV2")
  shape = weights_node.attr["shape"]
  logging.vlog(4, "weight shape = %s", shape)
  kernel_size_y = shape.shape.dim[0].size
  kernel_size_x = shape.shape.dim[1].size
  return kernel_size_x, kernel_size_y
def _get_layer_params(node, name_to_order_node):
  """Gets layer parameters relevant for RF computation.

  Currently, only these nodes are supported:
  - Conv2D
  - DepthwiseConv2dNative
  - Pad
  - MaxPool
  - AvgPool
  - all nodes listed in _UNCHANGED_RF_LAYER_OPS

  Args:
    node: Tensorflow node (NodeDef proto).
    name_to_order_node: Map from name to {order, node}. Output of
      graph_compute_order.get_compute_order().

  Returns:
    kernel_size_x: Kernel size for horizontal direction (integer).
    kernel_size_y: Kernel size for vertical direction (integer).
    stride_x: Stride size for horizontal direction (integer).
    stride_y: Stride size for vertical direction (integer).
    padding_x: Padding size for horizontal direction (integer).
    padding_y: Padding size for vertical direction (integer).

  Raises:
    ValueError: If layer op is unknown.
  """
  logging.vlog(3, "node.op = %s", node.op)
  logging.vlog(4, "node = %s", node)
  if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative":
    stride_x, stride_y = _stride_size(node)
    kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_order_node)
    # Compute the padding for this node separately for each direction.
    padding_x = _padding_size_conv_pool(node, kernel_size_x, stride_x)
    padding_y = _padding_size_conv_pool(node, kernel_size_y, stride_y)
  elif node.op == "Pad":
    # Kernel and stride are simply 1 in this case.
    kernel_size_x = 1
    kernel_size_y = 1
    stride_x = 1
    stride_y = 1
    padding_x, padding_y = _padding_size_pad_layer(node, name_to_order_node)
  elif node.op == "MaxPool" or node.op == "AvgPool":
    stride_x, stride_y = _stride_size(node)
    kernel_size_x, kernel_size_y = _pool_kernel_size(node)
    # Compute the padding for this node separately for each direction.
    padding_x = _padding_size_conv_pool(node, kernel_size_x, stride_x)
    padding_y = _padding_size_conv_pool(node, kernel_size_y, stride_y)
  elif node.op in _UNCHANGED_RF_LAYER_OPS:
    # These nodes do not modify the RF parameters.
    kernel_size_x = 1
    kernel_size_y = 1
    stride_x = 1
    stride_y = 1
    padding_x = 0
    padding_y = 0
  else:
    raise ValueError("Unknown layer for operation '%s': %s" % (node.name,
                                                               node.op))
  return kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, padding_y
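For context, a minimal sketch of how these receptive-field helpers might be driven over a whole graph. The graph_def argument and the call to graph_compute_order.get_compute_order() are assumptions used only for illustration; they are not defined by the snippet above.

def _collect_rf_params(graph_def):
  # Hedged sketch, not part of the module above: walk a GraphDef and collect
  # the RF-relevant parameters per node, skipping ops that _get_layer_params
  # rejects. Assumes get_compute_order() returns the name -> {order, node}
  # map that _get_layer_params expects.
  name_to_order_node = graph_compute_order.get_compute_order(graph_def)
  params = {}
  for node in graph_def.node:
    try:
      params[node.name] = _get_layer_params(node, name_to_order_node)
    except ValueError:
      continue  # Op is not relevant (or not supported) for RF computation.
  return params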
Example #4
  def register(self, candidate, name=None):
    """Registers a Python object "candidate" for the given "name".

    Args:
      candidate: The candidate object to add to the registry.
      name: An optional string specifying the registry key for the candidate.
            If None, candidate.__name__ will be used.
    Raises:
      KeyError: If same name is used twice.
    """
    if not name:
      name = candidate.__name__
    if name in self._registry:
      (filename, line_number, function_name, _) = (
          self._registry[name][_LOCATION_TAG])
      raise KeyError("Registering two %s with name '%s'! "
                     "(Previous registration was in %s %s:%d)" %
                     (self._name, name, function_name, filename, line_number))

    logging.vlog(1, "Registering %s (%s) in %s.", name, candidate, self._name)
    # stack trace is [this_function, Register(), user_function,...]
    # so the user function is #2.
    stack = tf_stack.extract_stack()
    stack_index = min(2, len(stack)-1)
    if stack_index >= 0:
      user_function = stack[stack_index]
      location_tag = tf_stack.convert_stack([user_function])[0]
    else:
      location_tag = "UNKNOWN"
    self._registry[name] = {_TYPE_TAG: candidate, _LOCATION_TAG: location_tag}
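A hedged usage sketch for the method above. The Registry class name and its constructor argument are inferred from the surrounding context (self._registry, self._name) and are assumptions for illustration only.

# Illustrative only: assumes this method belongs to a `Registry` class whose
# constructor takes the collection name stored in self._name.
fn_registry = Registry("custom functions")

def scaled_add(x, y, scale=1.0):
  return scale * (x + y)

fn_registry.register(scaled_add)               # stored under "scaled_add"
fn_registry.register(scaled_add, "ScaledAdd")  # stored under an explicit key
# Registering the same name twice raises KeyError and reports the file/line
# of the previous registration.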
Example #5
 def _runSamplingBenchmark(self, name, create_distribution, use_gpu,
                           num_components, batch_size, num_features,
                           sample_size):
   config = config_pb2.ConfigProto()
   config.allow_soft_placement = True
   np.random.seed(127)
   with session.Session(config=config, graph=ops.Graph()) as sess:
     random_seed.set_random_seed(0)
     with ops.device("/device:GPU:0" if use_gpu else "/cpu:0"):
       mixture = create_distribution(
           num_components=num_components,
           batch_size=batch_size,
           num_features=num_features)
       sample_op = mixture.sample(sample_size).op
       sess.run(variables.global_variables_initializer())
       reported = self.run_op_benchmark(
           sess,
           sample_op,
           min_iters=10,
           name=("%s_%s_components_%d_batch_%d_features_%d_sample_%d" %
                 (name, use_gpu, num_components, batch_size, num_features,
                  sample_size)))
       logging.vlog(2, "\t".join(["%s", "%d", "%d", "%d", "%d", "%g"]) % (
           use_gpu, num_components, batch_size, num_features, sample_size,
           reported["wall_time"]))
  def _time_performance_run_unifed_lstm_gpu(
      self, test_config, x_train, y_train):
    # Get performance numbers for UnifiedLSTM, with Grappler swapping in the
    # implementation.
    input_shape = test_config['input_shape']
    rnn_state_size = test_config['rnn_state_size']
    timestep = test_config['timestep']
    epoch = test_config['epoch']
    warmup_epoch = test_config['warmup_epoch']

    ops.reset_default_graph()
    K.set_session(session.Session(config=self.config))
    layer = UnifiedLSTM(rnn_state_size)
    inputs = keras.layers.Input(
        shape=[timestep, input_shape], dtype=dtypes.float32)

    outputs, _ = layer(inputs)
    model = keras.models.Model(inputs, outputs)
    model.compile('sgd', 'mse')

    total_duration = 0
    for i in range(epoch):
      start_time = time.time()
      model.fit(x_train, y_train)
      end_time = time.time()
      if i >= warmup_epoch:
        duration_per_epoch = end_time - start_time
        total_duration += duration_per_epoch
        logging.vlog(2, '%s: Time consumed for epoch %d is: %s',
                     'Unified LSTM', i, duration_per_epoch)
    logging.info('Average performance for %s per epoch is: %s',
                 'Unified LSTM', (total_duration / epoch))
    return total_duration / epoch
  def _time_performance_run_normal_lstm(
      self, test_config, x_train, y_train):
    # Get performance number for standard LSTM on GPU.
    input_shape = test_config['input_shape']
    rnn_state_size = test_config['rnn_state_size']
    timestep = test_config['timestep']
    epoch = test_config['epoch']
    warmup_epoch = test_config['warmup_epoch']

    ops.reset_default_graph()
    with self.test_session(use_gpu=True):
      layer = keras.layers.LSTM(rnn_state_size)
      inputs = keras.layers.Input(
          shape=[timestep, input_shape], dtype=dtypes.float32)

      outputs = layer(inputs)
      model = keras.models.Model(inputs, outputs)
      model.compile('sgd', 'mse')

      total_duration = 0
      for i in range(epoch):
        start_time = time.time()
        model.fit(x_train, y_train)
        end_time = time.time()
        if i >= warmup_epoch:
          duration_per_epoch = end_time - start_time
          total_duration += duration_per_epoch
          logging.vlog(2, '%s: Time consumed for epoch %d is: %s',
                       'Normal LSTM', i, duration_per_epoch)
      logging.info('Average performance for %s per epoch is: %s',
                   'Normal LSTM', (total_duration / epoch))
      return total_duration / epoch
Example #8
  def _ais_gets_correct_log_normalizer(self, init, event_dims, sess,
                                       feed_dict=None):
    def proposal_log_prob(x):
      return math_ops.reduce_sum(-0.5 * x * x - 0.5 * np.log(2*np.pi),
                                 event_dims)

    def target_log_prob(x):
      return self._log_gamma_log_prob(x, event_dims)

    if feed_dict is None:
      feed_dict = {}

    w, _, _ = hmc.ais_chain(200, 0.5, 2, init, target_log_prob,
                            proposal_log_prob, event_dims)

    w_val = sess.run(w, feed_dict)
    init_shape = sess.run(init, feed_dict).shape
    normalizer_multiplier = np.prod([init_shape[i] for i in event_dims])

    true_normalizer = -self._shape_param * np.log(self._rate_param)
    true_normalizer += special.gammaln(self._shape_param)
    true_normalizer *= normalizer_multiplier

    n_weights = np.prod(w_val.shape)
    normalized_w = np.exp(w_val - true_normalizer)
    standard_error = np.std(normalized_w) / np.sqrt(n_weights)
    logging.vlog(1, 'True normalizer {}, estimated {}, n_weights {}'.format(
        true_normalizer, np.log(normalized_w.mean()) + true_normalizer,
        n_weights))
    self.assertNear(normalized_w.mean(), 1.0, 4.0 * standard_error)
Example #9
  def _integrator_conserves_energy(self, x, event_dims, sess,
                                   feed_dict=None):
    def potential_and_grad(x):
      log_prob, grad = self._log_gamma_log_prob_grad(x, event_dims)
      return -log_prob, -grad

    step_size = array_ops.placeholder(np.float32, [], name='step_size')
    hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps')

    if feed_dict is None:
      feed_dict = {}
    feed_dict[hmc_lf_steps] = 1000

    m = random_ops.random_normal(array_ops.shape(x))
    potential_0, grad_0 = potential_and_grad(x)
    old_energy = potential_0 + 0.5 * math_ops.reduce_sum(m * m,
                                                         event_dims)

    _, new_m, potential_1, _ = (
        hmc.leapfrog_integrator(step_size, hmc_lf_steps, x,
                                m, potential_and_grad, grad_0))

    new_energy = potential_1 + 0.5 * math_ops.reduce_sum(new_m * new_m,
                                                         event_dims)

    x_shape = sess.run(x, feed_dict).shape
    n_event_dims = self._n_event_dims(x_shape, event_dims)
    feed_dict[step_size] = 0.1 / n_event_dims
    old_energy_val, new_energy_val = sess.run([old_energy, new_energy],
                                              feed_dict)
    logging.vlog(1, 'average energy change: {}'.format(
        abs(old_energy_val - new_energy_val).mean()))

    self.assertAllEqual(np.ones_like(new_energy_val, dtype=np.bool),
                        abs(old_energy_val - new_energy_val) < 1.)
Example #10
  def benchmarkSamplingMVNDiag(self):
    logging.vlog(
        2, "mvn_diag\tuse_gpu\tcomponents\tbatch\tfeatures\tsample\twall_time")

    def create_distribution(batch_size, num_components, num_features):
      cat = ds.Categorical(
          logits=np.random.randn(batch_size, num_components))
      mus = [
          variables.Variable(np.random.randn(batch_size, num_features))
          for _ in range(num_components)
      ]
      sigmas = [
          variables.Variable(np.random.rand(batch_size, num_features))
          for _ in range(num_components)
      ]
      components = list(
          ds.MultivariateNormalDiag(
              loc=mu, scale_diag=sigma) for (mu, sigma) in zip(mus, sigmas))
      return ds.Mixture(cat, components, use_static_graph=self.use_static_graph)

    for use_gpu in False, True:
      if use_gpu and not test.is_gpu_available():
        continue
      for num_components in 1, 8, 16:
        for batch_size in 1, 32:
          for num_features in 1, 64, 512:
            for sample_size in 1, 32, 128:
              self._runSamplingBenchmark(
                  "mvn_diag",
                  create_distribution=create_distribution,
                  use_gpu=use_gpu,
                  num_components=num_components,
                  batch_size=batch_size,
                  num_features=num_features,
                  sample_size=sample_size)
Example #11
  def _chain_gets_correct_expectations(self, x, event_dims, sess,
                                       feed_dict=None):
    def log_gamma_log_prob(x):
      return self._log_gamma_log_prob(x, event_dims)

    step_size = array_ops.placeholder(np.float32, [], name='step_size')
    hmc_lf_steps = array_ops.placeholder(np.int32, [], name='hmc_lf_steps')
    hmc_n_steps = array_ops.placeholder(np.int32, [], name='hmc_n_steps')

    if feed_dict is None:
      feed_dict = {}
    feed_dict.update({step_size: 0.1,
                      hmc_lf_steps: 2,
                      hmc_n_steps: 300})

    sample_chain, acceptance_prob_chain = hmc.chain([hmc_n_steps],
                                                    step_size,
                                                    hmc_lf_steps,
                                                    x, log_gamma_log_prob,
                                                    event_dims)

    acceptance_probs, samples = sess.run([acceptance_prob_chain, sample_chain],
                                         feed_dict)
    samples = samples[feed_dict[hmc_n_steps] // 2:]
    expected_x_est = samples.mean()
    expected_exp_x_est = np.exp(samples).mean()

    logging.vlog(1, 'True      E[x, exp(x)]: {}\t{}'.format(
        self._expected_x, self._expected_exp_x))
    logging.vlog(1, 'Estimated E[x, exp(x)]: {}\t{}'.format(
        expected_x_est, expected_exp_x_est))
    self.assertNear(expected_x_est, self._expected_x, 2e-2)
    self.assertNear(expected_exp_x_est, self._expected_exp_x, 2e-2)
    self.assertTrue((acceptance_probs > 0.5).all())
    self.assertTrue((acceptance_probs <= 1.0).all())
 def testImplicitLargeDiag(self):
   mu = np.array([[1., 2, 3],
                  [11, 22, 33]])      # shape: [b, k] = [2, 3]
   u = np.array([[[1., 2],
                  [3, 4],
                  [5, 6]],
                 [[0.5, 0.75],
                  [1, 0.25],
                  [1.5, 1.25]]])      # shape: [b, k, r] = [2, 3, 2]
   m = np.array([[0.1, 0.2],
                 [0.4, 0.5]])         # shape: [b, r] = [2, 2]
   scale = np.stack([
       np.eye(3) + np.matmul(np.matmul(u[0], np.diag(m[0])),
                             np.transpose(u[0])),
       np.eye(3) + np.matmul(np.matmul(u[1], np.diag(m[1])),
                             np.transpose(u[1])),
   ])
   cov = np.stack([np.matmul(scale[0], scale[0].T),
                   np.matmul(scale[1], scale[1].T)])
   logging.vlog(2, "expected_cov:\n{}".format(cov))
   with self.test_session():
     mvn = ds.MultivariateNormalDiagPlusLowRank(
         loc=mu,
         scale_perturb_factor=u,
         scale_perturb_diag=m)
     self.assertAllClose(cov, mvn.covariance().eval(), atol=0., rtol=1e-6)
def _padding_size_conv_pool(node, kernel_size, stride, input_resolution=None):
  """Computes padding size given a TF convolution or pooling node.

  Args:
    node: Tensorflow node (NodeDef proto).
    kernel_size: Kernel size of node (integer).
    stride: Stride size of node (integer).
    input_resolution: Input resolution to assume, if not None (integer).

  Returns:
    total_padding: Total padding size (integer).
    padding: Padding size, applied to the left or top (integer).

  Raises:
    ValueError: If padding is invalid.
  """
  # In this case, we need to carefully consider the different TF padding modes.
  # The padding depends on kernel size, and may depend on input size. If it
  # depends on input size and input_resolution is None, we raise an exception.
  padding_attr = node.attr["padding"]
  logging.vlog(4, "padding_attr = %s", padding_attr)
  if padding_attr.s in _VALID_PADDING:
    total_padding = 0
    padding = 0
  elif padding_attr.s in _SAME_PADDING:
    if input_resolution is None:
      # In this case, we do not know the input resolution, so we can only know
      # the padding in some special cases.
      if kernel_size == 1:
        total_padding = 0
        padding = 0
      elif stride == 1:
        total_padding = kernel_size - 1
        padding = int(math.floor(float(total_padding) / 2))
      elif stride == 2 and kernel_size % 2 == 0:
        # In this case, we can be sure of the left/top padding, but not of the
        # total padding.
        total_padding = None
        padding = int(math.floor((float(kernel_size) - 1) / 2))
      else:
        total_padding = None
        padding = None
        logging.warning(
            "Padding depends on input size, which means that the effective "
            "padding may be different depending on the input image "
            "dimensionality. In this case, alignment check will be skipped. If"
            " you know the input resolution, please set it.")
    else:
      # First, compute total_padding based on documentation.
      if input_resolution % stride == 0:
        total_padding = int(max(float(kernel_size - stride), 0.0))
      else:
        total_padding = int(
            max(float(kernel_size - (input_resolution % stride)), 0.0))
      # Then, compute left/top padding.
      padding = int(math.floor(float(total_padding) / 2))
  else:
    raise ValueError("Invalid padding operation %s" % padding_attr.s)
  return total_padding, padding
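To make the SAME-padding branch above concrete, a small worked example; the particular kernel, stride, and resolution values are illustrative, and node is assumed to be a convolution whose padding attribute is SAME.

# Worked example of the SAME-padding arithmetic above, assuming `node` uses
# SAME padding, kernel_size = 7, stride = 2, input_resolution = 224:
#   224 % 2 == 0, so total_padding = max(7 - 2, 0) = 5
#   padding (left/top)            = floor(5 / 2)   = 2
total_padding, padding = _padding_size_conv_pool(
    node, kernel_size=7, stride=2, input_resolution=224)
assert (total_padding, padding) == (5, 2)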
def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
                              extra_feed_dict):
  """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Jacobian by slightly perturbing the inputs and
  measuring the differences on the output.

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input
    extra_feed_dict: dict that allows fixing specified tensor values
      during the jacobian calculation.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in x and
    "y_size" is the number of elements in y.
  """
  # bfloat16 doesn't have enough bits to represent high precision numbers such
  # as delta. Convert to float32 here. Since numeric_jacobian is expected to
  # be the groundtruth to compare against, it shouldn't lose any information.
  if x.dtype == dtypes.bfloat16:
    # TODO(wangpeng): Now that the new x is an output of the old x, isn't
    # feeding to the new x a mistake?
    x = math_ops.cast(x, dtypes.float32)
  if y.dtype == dtypes.bfloat16:
    y = math_ops.cast(y, dtypes.float32)
  if x_data.dtype == dtypes.bfloat16.as_numpy_dtype:
    x_data = x_data.astype(np.float32)

  # To compute the jacobian, we treat x and y as one-dimensional vectors
  x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
  y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1)
  x_dtype = x.dtype.real_dtype.as_numpy_dtype
  y_dtype = y.dtype.real_dtype.as_numpy_dtype

  # Make sure we have the right types
  x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype)
  scale = np.asarray(2 * delta, dtype=y_dtype)[()]

  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
  # For each entry of x, we slightly perturb it by adding and subtracting a
  # delta, then compute the difference between the outputs. This gives us one
  # row of the Jacobian matrix.
  for row in range(x_size):
    x_pos = x_data.copy()
    x_neg = x_data.copy()
    x_pos.ravel().view(x_dtype)[row] += delta
    y_pos = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_pos}))
    x_neg.ravel().view(x_dtype)[row] -= delta
    y_neg = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_neg}))
    diff = (y_pos - y_neg) / scale
    jacobian[row, :] = diff.ravel().view(y_dtype)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian
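The loop above is a central-difference estimate of the Jacobian. The following self-contained NumPy sketch shows the same idea outside TensorFlow; the helper name and test function are illustrative only.

import numpy as np


def numeric_jacobian(f, x, delta=1e-3):
  """Central-difference Jacobian of f at x: one row per entry of x."""
  x = np.asarray(x, dtype=np.float64)
  y = np.asarray(f(x), dtype=np.float64)
  jacobian = np.zeros((x.size, y.size))
  for row in range(x.size):
    x_pos, x_neg = x.copy(), x.copy()
    x_pos.ravel()[row] += delta
    x_neg.ravel()[row] -= delta
    diff = np.asarray(f(x_pos), dtype=np.float64) - np.asarray(
        f(x_neg), dtype=np.float64)
    jacobian[row, :] = diff.ravel() / (2 * delta)
  return jacobian


# f(x) = x ** 2 has Jacobian diag(2 * x); the estimate is accurate to O(delta**2).
approx = numeric_jacobian(lambda v: v ** 2, [1.0, 2.0, 3.0])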
Example #15
  def _chain_gets_correct_expectations(self, x, independent_chain_ndims,
                                       sess, feed_dict=None):
    counter = collections.Counter()
    def log_gamma_log_prob(x):
      counter["target_calls"] += 1
      event_dims = math_ops.range(independent_chain_ndims,
                                  array_ops.rank(x))
      return self._log_gamma_log_prob(x, event_dims)

    num_results = array_ops.placeholder(
        np.int32, [], name="num_results")
    step_size = array_ops.placeholder(
        np.float32, [], name="step_size")
    num_leapfrog_steps = array_ops.placeholder(
        np.int32, [], name="num_leapfrog_steps")

    if feed_dict is None:
      feed_dict = {}
    feed_dict.update({num_results: 150,
                      step_size: 0.05,
                      num_leapfrog_steps: 2})

    samples, kernel_results = hmc.sample_chain(
        num_results=num_results,
        target_log_prob_fn=log_gamma_log_prob,
        current_state=x,
        step_size=step_size,
        num_leapfrog_steps=num_leapfrog_steps,
        num_burnin_steps=150,
        seed=42)

    self.assertAllEqual(dict(target_calls=2), counter)

    expected_x = (math_ops.digamma(self._shape_param)
                  - np.log(self._rate_param))

    expected_exp_x = self._shape_param / self._rate_param

    log_accept_ratio_, samples_, expected_x_ = sess.run(
        [kernel_results.log_accept_ratio, samples, expected_x],
        feed_dict)

    actual_x = samples_.mean()
    actual_exp_x = np.exp(samples_).mean()
    acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))

    logging_ops.vlog(1, "True      E[x, exp(x)]: {}\t{}".format(
        expected_x_, expected_exp_x))
    logging_ops.vlog(1, "Estimated E[x, exp(x)]: {}\t{}".format(
        actual_x, actual_exp_x))
    self.assertNear(actual_x, expected_x_, 2e-2)
    self.assertNear(actual_exp_x, expected_exp_x, 2e-2)
    self.assertAllEqual(np.ones_like(acceptance_probs, np.bool),
                        acceptance_probs > 0.5)
    self.assertAllEqual(np.ones_like(acceptance_probs, np.bool),
                        acceptance_probs <= 1.)
Example #16
def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx):
  """Computes the theoretical Jacobian for dy/dx.

  Computes the theoretical Jacobian using the ops generated by
  compute_gradient().

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x
    dy: the tensor "dy".
    dy_shape: the dimensions of dy as a tuple or an array of ints.
    dx: Tensor or IndexedSlices representing dx

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "dy_size" columns where "x_size" is the number of elements in x and
    "dy_size" is the number of elements in dy.
  """
  # Complex vectors are treated as vectors of twice as many reals.
  if x.dtype.is_complex:
    x_shape = tuple(x_shape) + (2,)
  dy_factor = 2 if dy.dtype.is_complex else 1

  # To compute the jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape)
  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients
  dy_size = _product(dy_shape) * dy_factor

  jacobian = np.zeros((x_size, dy_size),
                      dtype=x.dtype.real_dtype.as_numpy_dtype)
  # For each entry of dy, we set it to 1 and everything else to 0, then
  # compute the backprop -- this gives us one column of the Jacobian matrix.
  dy_data = np.zeros(dy_shape, dtype=dy.dtype.as_numpy_dtype)
  dy_data_flat = dy_data.ravel().view(dy.dtype.real_dtype.as_numpy_dtype)
  sess = ops.get_default_session()
  for col in range(dy_size):
    dy_data_flat[col] = 1
    if isinstance(dx, ops.IndexedSlices):
      backprop_indices, backprop_values = sess.run(
          [dx.indices, dx.values], feed_dict={x: x_data, dy: dy_data})
      for i, v in zip(backprop_indices, backprop_values):
        r_begin = i * x_val_size
        r_end = r_begin + x_val_size
        jacobian[r_begin:r_end, col] += v.flat
    else:
      assert isinstance(dx, ops.Tensor), "dx = " + str(dx)
      backprop = sess.run(dx, feed_dict={x: x_data, dy: dy_data})
      jacobian[:, col] = backprop.ravel().view(jacobian.dtype)
    dy_data_flat[col] = 0

  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
  return jacobian
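These two helpers are normally used as a pair: the gradient checker builds both Jacobians for the same op and bounds their largest elementwise difference (the quantity returned by the compute_gradient_error calls in the tests below). A hedged sketch of that comparison, assuming x, x_data, y, dy, dx, their shapes, and delta have already been set up by the usual compute_gradient() machinery:

# Hedged sketch only; none of the inputs are defined here.
numeric = _compute_numeric_jacobian(
    x, x_shape, x_data, y, y_shape, delta, extra_feed_dict={})
theoretical = _compute_theoretical_jacobian(
    x, x_shape, x_data, dy, dy_shape, dx)
max_error = np.max(np.abs(theoretical - numeric))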
  def _close_on_stop(self, sess, cancel_op, coord):
    """Close the queue when the Coordinator requests stop.

    Args:
      sess: A Session.
      cancel_op: The Operation to run.
      coord: Coordinator.
    """
    coord.wait_for_stop()
    try:
      sess.run(cancel_op)
    except Exception as e:
      # Intentionally ignore errors from cancel_op.
      logging.vlog(1, "Ignored exception: %s", str(e))
Example #18
  def _chain_gets_correct_expectations(self, x, independent_chain_ndims,
                                       sess, feed_dict=None):
    def log_gamma_log_prob(x):
      event_dims = math_ops.range(independent_chain_ndims,
                                  array_ops.rank(x))
      return self._log_gamma_log_prob(x, event_dims)

    num_results = array_ops.placeholder(
        np.int32, [], name="num_results")
    step_size = array_ops.placeholder(
        np.float32, [], name="step_size")
    num_leapfrog_steps = array_ops.placeholder(
        np.int32, [], name="num_leapfrog_steps")

    if feed_dict is None:
      feed_dict = {}
    feed_dict.update({num_results: 150,
                      step_size: 0.1,
                      num_leapfrog_steps: 2})

    samples, kernel_results = hmc.sample_chain(
        num_results=num_results,
        target_log_prob_fn=log_gamma_log_prob,
        current_state=x,
        step_size=step_size,
        num_leapfrog_steps=num_leapfrog_steps,
        num_burnin_steps=150,
        seed=42)

    expected_x = (math_ops.digamma(self._shape_param)
                  - np.log(self._rate_param))

    expected_exp_x = self._shape_param / self._rate_param

    acceptance_probs_, samples_, expected_x_ = sess.run(
        [kernel_results.acceptance_probs, samples, expected_x],
        feed_dict)

    actual_x = samples_.mean()
    actual_exp_x = np.exp(samples_).mean()

    logging_ops.vlog(1, "True      E[x, exp(x)]: {}\t{}".format(
        expected_x_, expected_exp_x))
    logging_ops.vlog(1, "Estimated E[x, exp(x)]: {}\t{}".format(
        actual_x, actual_exp_x))
    self.assertNear(actual_x, expected_x_, 2e-2)
    self.assertNear(actual_exp_x, expected_exp_x, 2e-2)
    self.assertTrue((acceptance_probs_ > 0.5).all())
    self.assertTrue((acceptance_probs_ <= 1.0).all())
def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
                              extra_feed_dict):
  """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Jacobian by slightly perturbing the inputs and
  measuring the differences on the output.

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input
    extra_feed_dict: dict that allows fixing specified tensor values
      during the jacobian calculation.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in x and
    "y_size" is the number of elements in y.
  """

  # To compute the jacobian, we treat x and y as one-dimensional vectors
  x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
  y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1)
  x_dtype = x.dtype.real_dtype.as_numpy_dtype
  y_dtype = y.dtype.real_dtype.as_numpy_dtype

  # Make sure we have the right types
  x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype)
  scale = np.asarray(2 * delta, dtype=y_dtype)[()]

  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
  # For each entry of x, we slightly perturb it by adding and subtracting a
  # delta, then compute the difference between the outputs. This gives us one
  # row of the Jacobian matrix.
  for row in range(x_size):
    x_pos = x_data.copy()
    x_neg = x_data.copy()
    x_pos.ravel().view(x_dtype)[row] += delta
    y_pos = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_pos}))
    x_neg.ravel().view(x_dtype)[row] -= delta
    y_neg = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_neg}))
    diff = (y_pos - y_neg) / scale
    jacobian[row, :] = diff.ravel().view(y_dtype)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian
 def testGradient(self):
   with self.test_session():
     x = constant_op.constant(
         [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
         shape=[2, 5],
         name="x")
     y = nn_ops.softplus(x, name="softplus")
     x_init = np.asarray(
         [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
         dtype=np.float32,
         order="F")
     err = gradient_checker.compute_gradient_error(
         x, [2, 5], y, [2, 5], x_init_value=x_init)
   logging.vlog(2, "softplus (float) gradient err = ", err)
   self.assertLess(err, 1e-4)
def _stride_size(node):
  """Computes stride size given a TF node.

  Args:
    node: Tensorflow node (NodeDef proto).

  Returns:
    stride_x: Stride size for horizontal direction (integer).
    stride_y: Stride size for vertical direction (integer).
  """
  strides_attr = node.attr["strides"]
  logging.vlog(4, "strides_attr = %s", strides_attr)
  stride_y = strides_attr.list.i[1]
  stride_x = strides_attr.list.i[2]
  return stride_x, stride_y
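A minimal check of the NHWC stride layout this helper assumes (index 1 is the vertical stride, index 2 the horizontal one). Building a NodeDef by hand here is purely illustrative; the helper is normally fed nodes parsed from a real graph.

# Illustrative only: hand-build a NodeDef with strides = [1, 2, 3, 1] (NHWC)
# and confirm stride_y is read from index 1 and stride_x from index 2.
from tensorflow.core.framework import node_def_pb2

fake_conv = node_def_pb2.NodeDef(name="conv", op="Conv2D")
fake_conv.attr["strides"].list.i.extend([1, 2, 3, 1])
assert _stride_size(fake_conv) == (3, 2)  # (stride_x, stride_y)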
Example #22
  def _show_compute(self, show_dataflow):
    """Visualize the computation activity."""
    for dev_stats in self._step_stats.dev_stats:
      device_name = dev_stats.device
      device_pid = self._device_pids[device_name]
      is_gputrace = self._is_gputrace_device(device_name)

      for node_stats in dev_stats.node_stats:
        tid = node_stats.thread_id
        start_time = node_stats.all_start_micros
        end_time = node_stats.all_start_micros + node_stats.all_end_rel_micros
        self._emit_op(node_stats, device_pid, is_gputrace)

        if is_gputrace:
          continue

        _, _, inputs = self._parse_op_label(node_stats.timeline_label)
        for input_name in inputs:
          if input_name not in self._tensors:
            # This can happen when partitioning has inserted a Send/Recv.
            # We remove the numeric suffix so that the dataflow appears to
            # come from the original node.  Ideally, the StepStats would
            # contain logging for the Send and Recv nodes.
            index = input_name.rfind('/_')
            if index > 0:
              input_name = input_name[:index]

          if input_name in self._tensors:
            tensor = self._tensors[input_name]
            tensor.add_ref(start_time)
            tensor.add_unref(end_time - 1)

            if show_dataflow:
              # We use a different flow ID for every graph edge.
              create_time, create_pid, create_tid = self._flow_starts[
                  input_name]
              # Don't add flows when producer and consumer ops are on the same
              # pid/tid since the horizontal arrows clutter the visualization.
              if create_pid != device_pid or create_tid != tid:
                flow_id = self._alloc_flow_id()
                self._chrome_trace.emit_flow_start(input_name, create_time,
                                                   create_pid, create_tid,
                                                   flow_id)
                self._chrome_trace.emit_flow_end(input_name, start_time,
                                                 device_pid, tid, flow_id)
          else:
            logging.vlog(1, 'Can\'t find tensor %s - removed by CSE?',
                         input_name)
  def _run(self, sess, enqueue_op, feed_fn, coord=None):
    """Execute the enqueue op in a loop, close the queue in case of error.

    Args:
      sess: A `Session`.
      enqueue_op: The `Operation` to run.
      feed_fn: the feed function to pass to `sess.run`.
      coord: Optional `Coordinator` object for reporting errors and checking
        for stop conditions.

    """
    # TODO(jamieas): Reduce code duplication with `QueueRunner`.
    if coord:
      coord.register_thread(threading.current_thread())
    decremented = False
    try:
      while True:
        if coord and coord.should_stop():
          break
        try:
          feed_dict = None if feed_fn is None else feed_fn()
          sess.run(enqueue_op, feed_dict=feed_dict)
        except (errors.OutOfRangeError, errors.CancelledError):
          # This exception indicates that a queue was closed.
          with self._lock:
            self._runs_per_session[sess] -= 1
            decremented = True
            if self._runs_per_session[sess] == 0:
              try:
                sess.run(self._close_op)
              except Exception as e:
                # Intentionally ignore errors from close_op.
                logging.vlog(1, "Ignored exception: %s", str(e))
            return
    except Exception as e:
      # This catches all other exceptions.
      if coord:
        coord.request_stop(e)
      else:
        logging.error("Exception in QueueRunner: %s", str(e))
        with self._lock:
          self._exceptions_raised.append(e)
        raise
    finally:
      # Make sure we account for all terminations: normal or errors.
      if not decremented:
        with self._lock:
          self._runs_per_session[sess] -= 1
  def _run(self, sess, enqueue_op, coord=None):
    """Execute the enqueue op in a loop, close the queue in case of error.

    Args:
      sess: A Session.
      enqueue_op: The Operation to run.
      coord: Optional Coordinator object for reporting errors and checking
        for stop conditions.
    """
    decremented = False
    try:
      # Make a cached callable from the `enqueue_op` to decrease the
      # Python overhead in the queue-runner loop.
      enqueue_callable = sess.make_callable(enqueue_op)
      while True:
        if coord and coord.should_stop():
          break
        try:
          enqueue_callable()
        except self._queue_closed_exception_types:  # pylint: disable=catching-non-exception
          # This exception indicates that a queue was closed.
          with self._lock:
            self._runs_per_session[sess] -= 1
            decremented = True
            if self._runs_per_session[sess] == 0:
              try:
                sess.run(self._close_op)
              except Exception as e:
                # Intentionally ignore errors from close_op.
                logging.vlog(1, "Ignored exception: %s", str(e))
            return
    except Exception as e:
      # This catches all other exceptions.
      if coord:
        coord.request_stop(e)
      else:
        logging.error("Exception in QueueRunner: %s", str(e))
        with self._lock:
          self._exceptions_raised.append(e)
        raise
    finally:
      # Make sure we account for all terminations: normal or errors.
      if not decremented:
        with self._lock:
          self._runs_per_session[sess] -= 1
Example #25
  def _run(self, sess, enqueue_op, coord=None):
    """Execute the enqueue op in a loop, close the queue in case of error.

    Args:
      sess: A Session.
      enqueue_op: The Operation to run.
      coord: Optional Coordinator object for reporting errors and checking
        for stop conditions.
    """
    if coord:
      coord.register_thread(threading.current_thread())
    decremented = False
    try:
      while True:
        if coord and coord.should_stop():
          break
        try:
          sess.run(enqueue_op)
        except errors.OutOfRangeError:
          # This exception indicates that a queue was closed.
          with self._lock:
            self._runs -= 1
            decremented = True
            if self._runs == 0:
              try:
                sess.run(self._close_op)
              except Exception as e:
                # Intentionally ignore errors from close_op.
                logging.vlog(1, "Ignored exception: %s", str(e))
            return
    except Exception as e:
      # This catches all other exceptions.
      if coord:
        coord.request_stop(e)
      else:
        logging.error("Exception in QueueRunner: %s", str(e))
        with self._lock:
          self._exceptions_raised.append(e)
        raise
    finally:
      # Make sure we account for all terminations: normal or errors.
      if not decremented:
        with self._lock:
          self._runs -= 1
 def testGradientsEmbeddingLookup(self):
   vocab_size = 9
   num_ids = 10
   id_vals = list(np.random.randint(vocab_size, size=num_ids))
   tf_logging.vlog(1, id_vals)
   for ids_shape in [(10,), (2, 5)]:
     for num_shards in [1, 3]:
       with self.test_session():
         ids = constant_op.constant(
             id_vals, shape=ids_shape, dtype=dtypes.int32)
         x, params, _ = _EmbeddingParams(num_shards, vocab_size, shape=[2])
         y = embedding_ops.embedding_lookup(x, ids)
         y_shape = [num_ids] + list(params[_PName(0) + ":0"].shape[1:])
         x_name = [_PName(i) for i in range(num_shards)]
         x_init_value = [params[x_n + ":0"] for x_n in x_name]
         x_shape = [i.shape for i in x_init_value]
         err = gradient_checker.compute_gradient_error(
             x, x_shape, y, y_shape, x_init_value=x_init_value)
       self.assertLess(err, 1e-4)
 def testGradientsEmbeddingLookupWithComputedParams(self):
   vocab_size = 9
   num_ids = 5
   id_vals = list(np.random.randint(vocab_size, size=num_ids))
   tf_logging.vlog(1, id_vals)
   for num_shards in [1, 3]:
     with self.test_session():
       ids = constant_op.constant(id_vals, dtype=dtypes.int32)
       x, params, _ = _EmbeddingParams(num_shards, vocab_size, shape=[2])
       # This will force a conversion from IndexedSlices to Tensor.
       x_squared = [math_ops.square(elem) for elem in x]
       y = embedding_ops.embedding_lookup(x_squared, ids)
       y_shape = [num_ids] + list(params[_PName(0) + ":0"].shape[1:])
       x_name = [_PName(i) for i in range(num_shards)]
       x_init_value = [params[x_n + ":0"] for x_n in x_name]
       x_shape = [i.shape for i in x_init_value]
       err = gradient_checker.compute_gradient_error(
           x, x_shape, y, y_shape, x_init_value=x_init_value)
     self.assertLess(err, 1e-3)
Example #28
  def benchmarkSamplingMVNFull(self):
    logging.vlog(
        2, "mvn_full\tuse_gpu\tcomponents\tbatch\tfeatures\tsample\twall_time")

    def psd(x):
      """Construct batch-wise PSD matrices."""
      return np.stack([np.dot(np.transpose(z), z) for z in x])

    def create_distribution(batch_size, num_components, num_features):
      cat = ds.Categorical(
          logits=np.random.randn(batch_size, num_components))
      mus = [
          variables.Variable(np.random.randn(batch_size, num_features))
          for _ in range(num_components)
      ]
      sigmas = [
          variables.Variable(
              psd(np.random.rand(batch_size, num_features, num_features)))
          for _ in range(num_components)
      ]
      components = list(
          ds.MultivariateNormalTriL(
              loc=mu, scale_tril=linalg_ops.cholesky(sigma))
          for (mu, sigma) in zip(mus, sigmas))
      return ds.Mixture(cat, components, use_static_graph=self.use_static_graph)

    for use_gpu in False, True:
      if use_gpu and not test.is_gpu_available():
        continue
      for num_components in 1, 8, 16:
        for batch_size in 1, 32:
          for num_features in 1, 64, 512:
            for sample_size in 1, 32, 128:
              self._runSamplingBenchmark(
                  "mvn_full",
                  create_distribution=create_distribution,
                  use_gpu=use_gpu,
                  num_components=num_components,
                  batch_size=batch_size,
                  num_features=num_features,
                  sample_size=sample_size)
Example #29
  def testNanFromGradsDontPropagate(self):
    """Test that update with NaN gradients does not cause NaN in results."""
    def _nan_log_prob_with_nan_gradient(x):
      return np.nan * math_ops.reduce_sum(x)

    with self.test_session() as sess:
      initial_x = math_ops.linspace(0.01, 5, 10)
      updated_x, kernel_results = hmc.kernel(
          target_log_prob_fn=_nan_log_prob_with_nan_gradient,
          current_state=initial_x,
          step_size=2.,
          num_leapfrog_steps=5,
          seed=47)
      initial_x_, updated_x_, acceptance_probs_ = sess.run(
          [initial_x, updated_x, kernel_results.acceptance_probs])

      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))

      self.assertAllEqual(initial_x_, updated_x_)
      self.assertEqual(acceptance_probs_, 0.)

      self.assertAllFinite(
          gradients_ops.gradients(updated_x, initial_x)[0].eval())
      self.assertAllEqual([True], [g is None for g in gradients_ops.gradients(
          kernel_results.proposed_grads_target_log_prob, initial_x)])
      self.assertAllEqual([False], [g is None for g in gradients_ops.gradients(
          kernel_results.proposed_grads_target_log_prob,
          kernel_results.proposed_state)])
Example #30
  def testNanRejection(self):
    """Tests that an update that yields NaN potentials gets rejected.

    We run HMC with a target distribution that returns NaN
    log-likelihoods if any element of x < 0, and unit-scale
    exponential log-likelihoods otherwise. The exponential potential
    pushes x towards 0, ensuring that any reasonably large update will
    push us over the edge into NaN territory.
    """
    def _unbounded_exponential_log_prob(x):
      """An exponential distribution with log-likelihood NaN for x < 0."""
      per_element_potentials = array_ops.where(
          x < 0.,
          array_ops.fill(array_ops.shape(x), x.dtype.as_numpy_dtype(np.nan)),
          -x)
      return math_ops.reduce_sum(per_element_potentials)

    with self.test_session() as sess:
      initial_x = math_ops.linspace(0.01, 5, 10)
      updated_x, kernel_results = hmc.kernel(
          target_log_prob_fn=_unbounded_exponential_log_prob,
          current_state=initial_x,
          step_size=2.,
          num_leapfrog_steps=5,
          seed=46)
      initial_x_, updated_x_, acceptance_probs_ = sess.run(
          [initial_x, updated_x, kernel_results.acceptance_probs])

      logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
      logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
      logging_ops.vlog(1, "acceptance_probs = {}".format(acceptance_probs_))

      self.assertAllEqual(initial_x_, updated_x_)
      self.assertEqual(acceptance_probs_, 0.)
Example #31
def _LogOpGradients(op, out_grads, in_grads):
    """Log the in and out grads of an op."""
    logging.vlog(1, "Gradient for '" + op.name + "'")

    def _FilterGrad(x):
        if x is None:
            return False
        if isinstance(x, (list, tuple)):
            return bool(x)
        else:
            return True

    logging.vlog(1, "  in  --> %s",
                 ", ".join(x.name for x in out_grads if _FilterGrad(x)))
    logging.vlog(1, "  out --> %s",
                 ", ".join(x.name for x in in_grads if _FilterGrad(x)))
Example #32
    def testNanRejection(self):
        """Tests that an update that yields NaN potentials gets rejected.

        We run HMC with a target distribution that returns NaN
        log-likelihoods if any element of x < 0, and unit-scale
        exponential log-likelihoods otherwise. The exponential potential
        pushes x towards 0, ensuring that any reasonably large update will
        push us over the edge into NaN territory.
        """
        def _unbounded_exponential_log_prob(x):
            """An exponential distribution with log-likelihood NaN for x < 0."""
            per_element_potentials = array_ops.where(
                x < 0.,
                array_ops.fill(array_ops.shape(x),
                               x.dtype.as_numpy_dtype(np.nan)), -x)
            return math_ops.reduce_sum(per_element_potentials)

        with self.test_session(graph=ops.Graph()) as sess:
            initial_x = math_ops.linspace(0.01, 5, 10)
            updated_x, kernel_results = hmc.kernel(
                target_log_prob_fn=_unbounded_exponential_log_prob,
                current_state=initial_x,
                step_size=2.,
                num_leapfrog_steps=5,
                seed=46)
            initial_x_, updated_x_, log_accept_ratio_ = sess.run(
                [initial_x, updated_x, kernel_results.log_accept_ratio])
            acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))

            logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
            logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
            logging_ops.vlog(1,
                             "log_accept_ratio = {}".format(log_accept_ratio_))

            self.assertAllEqual(initial_x_, updated_x_)
            self.assertEqual(acceptance_probs, 0.)
Example #33
    def testNanFromGradsDontPropagate(self):
        """Test that update with NaN gradients does not cause NaN in results."""
        def _nan_log_prob_with_nan_gradient(x):
            return np.nan * math_ops.reduce_sum(x)

        with self.test_session(graph=ops.Graph()) as sess:
            initial_x = math_ops.linspace(0.01, 5, 10)
            updated_x, kernel_results = hmc.kernel(
                target_log_prob_fn=_nan_log_prob_with_nan_gradient,
                current_state=initial_x,
                step_size=2.,
                num_leapfrog_steps=5,
                seed=47)
            initial_x_, updated_x_, log_accept_ratio_ = sess.run(
                [initial_x, updated_x, kernel_results.log_accept_ratio])
            acceptance_probs = np.exp(np.minimum(log_accept_ratio_, 0.))

            logging_ops.vlog(1, "initial_x = {}".format(initial_x_))
            logging_ops.vlog(1, "updated_x = {}".format(updated_x_))
            logging_ops.vlog(1,
                             "log_accept_ratio = {}".format(log_accept_ratio_))

            self.assertAllEqual(initial_x_, updated_x_)
            self.assertEqual(acceptance_probs, 0.)

            self.assertAllFinite(
                gradients_ops.gradients(updated_x, initial_x)[0].eval())
            self.assertAllEqual([True], [
                g is None for g in gradients_ops.gradients(
                    kernel_results.proposed_grads_target_log_prob, initial_x)
            ])
            self.assertAllEqual([False], [
                g is None for g in gradients_ops.gradients(
                    kernel_results.proposed_grads_target_log_prob,
                    kernel_results.proposed_state)
            ])
Example #34
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None):
    """Constructs symbolic partial derivatives of sum of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the partial
  derivatives of `ys` with respect to `xs`.  It returns a list of
  `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
  for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operation.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.

  """
    ys = _AsList(ys)
    xs = _AsList(xs)
    if grad_ys is None:
        grad_ys = [None] * len(ys)
    else:
        grad_ys = _AsList(grad_ys)
    with ops.op_scope(ys + xs + grad_ys, name, "gradients"):
        ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
        xs = ops.convert_n_to_tensor_or_indexed_slices(xs, name="x")
        grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

        # The approach we take here is as follows: Create a list of all ops in the
        # subgraph between the ys and xs.  Visit these ops in reverse order of ids
        # to ensure that when we visit an op the gradients w.r.t its outputs have
        # been collected.  Then aggregate these gradients if needed, call the op's
        # gradient function, and add the generated gradients to the gradients for
        # its input.

        # Initialize the pending count for ops in the connected subgraph from ys
        # to the xs.
        to_ops = [t.op for t in ys]
        from_ops = [t.op for t in xs]
        pending_count, loop_state = _PendingCount(ops.get_default_graph(),
                                                  to_ops, from_ops)

        # Iterate over the collected ops.
        #
        # grads: op => list of gradients received on each output endpoint of the
        # op.  The gradients for each endpoint are initially collected as a list.
        # When it is time to call the op's gradient function, for each endpoint we
        # aggregate the list of received gradients into a Add() Operation if there
        # is more than one.
        grads = {}

        # Add the initial gradients for the ys.
        for y, grad_y in zip(ys, grad_ys):
            _SetGrad(grads, y, grad_y)

        # Initialize queue with to_ops.
        queue = collections.deque()
        # Add the ops in 'to_ops' into the queue.
        to_ops_set = set()
        for op in to_ops:
            # 'ready' handles the case where one output gradient relies on
            # another output's gradient.
            # pylint: disable=protected-access
            ready = (pending_count[op._id] == 0)
            if ready and op._id not in to_ops_set:
                to_ops_set.add(op._id)
                queue.append(op)

        if loop_state:
            # The "unused" exits of the loops are added to ys. As an example,
            # people often write:
            #         v1, _ = While(p, b, [x1, x2])
            #         result = gradients(v1, x1)
            # The exit node of x2 is not included by the betweenness analysis.
            # But we need it if x2 is involved in computing v1. So we add it
            # back in backprop with a zeros_like gradient.
            loop_exits = loop_state.GetAllLoopExits()
            for y in loop_exits:
                if pending_count[y.op._id] == 0 and y.op._id not in to_ops_set:
                    if _IsFloat(y):
                        # Floating-point outputs get a zero gradient.
                        _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
                    queue.append(y.op)

        # The set of 'from_ops'.
        stop_ops = _StopOps(from_ops, pending_count)
        while queue:
            # generate gradient subgraph for op.
            op = queue.popleft()
            with _maybe_colocate_with(op, colocate_gradients_with_ops):
                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=True)
                out_grads = _AggregatedGrads(grads, op, loop_state,
                                             aggregation_method)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=True)

                grad_fn = None
                # pylint: disable=protected-access
                is_func_call = ops.get_default_graph()._is_function(op.type)
                if not is_func_call and any(
                        isinstance(g, ops.Tensor) or g
                        for g in out_grads) and (op._id not in stop_ops):
                    # pylint: enable=protected-access
                    # A grad_fn must be defined, either as a function or as None
                    # for ops that do not have gradients.
                    try:
                        grad_fn = ops.get_gradient_function(op)
                    except LookupError:
                        raise LookupError(
                            "No gradient defined for operation '%s' (op type: %s)"
                            % (op.name, op.type))

                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=False)
                if (grad_fn or is_func_call) and any(
                        isinstance(g, ops.Tensor) or g for g in out_grads):
                    # NOTE: If _AggregatedGrads didn't compute a value for the i'th
                    # output, it means that the cost does not depend on output[i],
                    # therefore dC/doutput[i] is 0.
                    for i, out_grad in enumerate(out_grads):
                        if (not isinstance(out_grad, ops.Tensor)
                                and not out_grad) and _IsFloat(op.outputs[i]):
                            # Only floating-point outputs get a zero gradient. Gradient
                            # functions should ignore the gradient for other outputs.
                            if loop_state:
                                out_grads[i] = loop_state.ZerosLike(op, i)
                            else:
                                out_grads[
                                    i] = control_flow_ops.ZerosLikeOutsideLoop(
                                        op, i)
                    with ops.name_scope(op.name + "_grad"):
                        # pylint: disable=protected-access
                        with ops.get_default_graph()._original_op(op):
                            # pylint: enable=protected-access
                            if is_func_call:
                                # For function call ops, we add a 'SymbolicGradient'
                                # node to the graph to compute gradients.
                                f_in = [x for x in op.inputs] + out_grads
                                f_types = [x.dtype for x in op.inputs]
                                # pylint: disable=protected-access
                                in_grads = _AsList(
                                    functional_ops._symbolic_gradient(
                                        f_in, f_types, op.type))
                                # pylint: enable=protected-access
                            else:
                                in_grads = _AsList(grad_fn(op, *out_grads))
                            _VerifyGeneratedGradients(in_grads, op)
                            if gate_gradients and len(
                                [x for x in in_grads if x is not None]) > 1:
                                in_grads = control_flow_ops.tuple(in_grads)
                    logging.vlog(1, "Gradient for '" + op.name + "'")

                    def _FilterGrad(x):
                        if x is None:
                            return False
                        if isinstance(x, (list, tuple)):
                            return bool(x)
                        else:
                            return True

                    logging.vlog(
                        1, "  in  --> %s", ", ".join(
                            [x.name for x in out_grads if _FilterGrad(x)]))
                    logging.vlog(
                        1, "  out --> %s",
                        ", ".join([x.name for x in in_grads
                                   if _FilterGrad(x)]))
                else:
                    # If no grad_fn is defined or none of out_grads is available,
                    # just propagates a list of None backwards.
                    in_grads = [None] * len(op.inputs)
                for t_in, in_grad in zip(op.inputs, in_grads):
                    if in_grad is not None:
                        if isinstance(in_grad, ops.Tensor):
                            in_grad.set_shape(t_in.get_shape())
                        _SetGrad(grads, t_in, in_grad)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=False)

            # update pending count for the inputs of op.
            # pylint: disable=protected-access
            for x in op.inputs:
                pending_count[x.op._id] -= 1
                ready = (pending_count[x.op._id] == 0)
                if loop_state and not ready:
                    ready = (pending_count[x.op._id] > 0
                             and control_flow_ops.IsLoopSwitch(x.op))
                if ready:
                    queue.append(x.op)
            for x in op.control_inputs:
                pending_count[x._id] -= 1
                if pending_count[x._id] is 0:
                    queue.append(x)
            # pylint: enable=protected-access
    #print("===============")
    #print()
    #ggg = [_GetGrad(grads, x) for x in xs]
    #print(ggg)
    return [_GetGrad(grads, x) for x in xs]
Beispiel #35
0
def dtensor_initialize_tpu_system(enable_coordination_service=False):
  """Initialize the TPU devices.

  Args:
    enable_coordination_service: If true, enable distributed coordination
      service to make sure that workers know the devices on each other, a
      prerequisite for data transfer through cross-worker rendezvous.

  Raises:
    RuntimeError: If running inside a tf.function.
    NotFoundError: If no TPU devices found in eager mode.
  """

  assert context.executing_eagerly()
  in_multi_client_mode = api.job_name() != "localhost"

  # Collective GRPC servers are only necessary in mutli-client setup.
  # Single clients (e.g. Forge) can use local mode of collectives.
  if in_multi_client_mode:
    if api.jobs() is None:
      raise ValueError(
          "DTENSOR_JOBS environment variable is required when"
          "using multi-client to properly set up communications between servers"
      )
    multi_client_util.initialize_multi_client_cluster(
        job_name=api.job_name(),
        dtensor_jobs=api.jobs(),
        client_id=api.client_id(),
        collective_leader=api.full_job_name(task_id=0),
        enable_coordination_service=enable_coordination_service)

  # Make sure the server change is fully propagated before attempting to run
  # the core ID merging logic below.
  context.ensure_initialized()
  context.async_wait()
  context.context()._clear_caches()  # pylint: disable=protected-access

  @function.defun
  def _tpu_init_fn():
    return gen_dtensor_ops.configure_and_initialize_global_tpu()

  try:
    with ops.device("/job:" + api.full_job_name() + "/device:TPU_SYSTEM:0"):  # pylint: disable=protected-access
      my_core_ids = _tpu_init_fn()
    logging.info("TPU core IDs: %s", my_core_ids)
    context.initialize_logical_devices()

    # Configure virtual CPUs that is 1:1 mapped to TPU cores.
    context.context().set_logical_cpu_devices(
        len(api.local_devices(_TPU_DEVICE_TYPE)),
        tf_device.DeviceSpec(
            job=api.job_name(), replica=0, task=api.client_id()).to_string())

    # `my_core_ids` contains the IDs of TPU cores attached to this host.
    #
    # To generate correct and efficient XLA AllReduce group assignment, we must
    # merge these arrays from all hosts and broadcast the result back to all
    # hosts, so all hosts can use these mappings in their MLIR passes.
    #
    # This is essentially doing what WaitForDistributedTpuOp and
    # SetGlobalTPUArrayOp do, in our multi-client environment.
    task_id = api.client_id()
    num_tasks = api.num_clients()
    num_devices = api.num_global_devices(_TPU_DEVICE_TYPE)
    num_devices_per_task = int(num_devices / num_tasks)

    # Create a one-time use mesh and layout just for merging core IDs.
    mesh = layout_lib.Mesh([_MESH_DIM_X],
                           *_create_device_array((num_devices,),
                                                 _TPU_DEVICE_TYPE,
                                                 api.client_id()))
    layout = layout_lib.Layout([_MESH_DIM_X, layout_lib.UNSHARDED], mesh)
    device = dtensor_device.DTensorDevice(meshes=[mesh])
    logging.info("TPU core locations: %s",
                 device.tpu_core_ids_to_locations(my_core_ids))

    # At this point, we don't know which cores are attached to other hosts.
    # The core ID mappings in the runtime haven't been set yet.
    #
    # The core ID merging AllReduce below is carefully written so it works
    # without needing correct core mappings to be set in the runtime. We will
    # use this AllReduce's result to set the core ID mappings, and all future
    # user-initiated AllReduces will use the mappings.
    #
    # The runtime is hard-coded to ignore core ID mappings on this AllReduce.
    all_core_ids = np.zeros([num_devices], dtype=np.int32)
    for i in range(len(my_core_ids)):
      all_core_ids[task_id * num_devices_per_task + i] = my_core_ids[i]

    # Only one local device gets valid input: 8 local core IDs among
    # (num_tasks - 1) * 8 zeros. The 8 core IDs are set using task ID as offset.
    # The other 7 local devices get zero inputs. All devices on all host
    # participate in one AllReduce, whose result will be core IDs arranged by
    # task-device ordinals.
    all_core_ids = constant_op.constant([all_core_ids])
    zeros = array_ops.zeros_like(all_core_ids)
    all_core_ids = [all_core_ids] + [zeros] * (num_devices_per_task - 1)

    with ops.device(device.name):
      all_core_ids = device.pack(all_core_ids, layout)
      all_core_ids = math_ops.reduce_sum(all_core_ids, axis=[0])
      unpacked_all_tpu_ids = device.unpack(all_core_ids)

    all_core_ids = list(unpacked_all_tpu_ids[0].numpy())
    logging.info("All TPU core IDs: %s", all_core_ids)

    # Set the default core ID mappings in the runtime for legacy code and tests.
    #
    # Legacy code and tests create TPU meshes directly without using the
    # `create_tpu_mesh` function below. Those meshes have global device IDs
    # equal to TF task-device ordinals. The `all_core_ids` array happens to
    # arrange core IDs by TF task-device ordinals. Using this array on those
    # meshes guarantee correct although inefficient results.
    device.set_tpu_core_ids("", all_core_ids)

    # Remember enough global, immutable information to be able to build any ring
    # we want prescribed by `create_tpu_mesh` in the future.
    global _all_core_ids
    _all_core_ids = all_core_ids

    all_core_locations = device.tpu_core_ids_to_locations(all_core_ids)
    all_core_locations = [
        _CoreLocation(l[0], l[1], l[2], l[3]) for l in all_core_locations
    ]
    global _all_core_locations
    _all_core_locations = all_core_locations
    logging.info("All TPU core locations: %s", all_core_locations)

    tpu_topology = _create_tpu_topology(all_core_locations, num_tasks,
                                        num_devices_per_task)
    global _tpu_topology
    _tpu_topology = tpu_topology
    logging.vlog(1, "TPU Topology: %s, %s", tpu_topology.mesh_shape,
                 tpu_topology.device_coordinates)

    global _dtensor_device
    _dtensor_device = device

    context.async_wait()

  except errors.InvalidArgumentError as e:
    raise errors.NotFoundError(
        None, None, "Initialization failed, no valid TPUs found. " + str(e))

  except errors.InternalError as e:
    logging.error("Hit internal error during TPU system initialization. "
                  + "It is likely hareware failure. \nPlease check the error "
                  + "messages above to see whether that's the case. \nIf so, "
                  + "consider to restart the job or try another machine.")
    raise e

  # Optionally exchange heartbeats between workers every minute.
  if in_multi_client_mode and api.heartbeat_enabled():
    logging.info(
        "Starting DTensor heartbeat service exchanging signals every 10 minutes"
    )
    heartbeat.start(period=180)

  # Clear out the eager context caches since the memory is invalid now.
  logging.info("Clearing out eager caches")
  context.context()._clear_caches()  # pylint: disable=protected-access
Beispiel #36
0
    def _kernel_leaves_target_invariant(self,
                                        initial_draws,
                                        independent_chain_ndims,
                                        sess,
                                        feed_dict=None):
        def log_gamma_log_prob(x):
            event_dims = math_ops.range(independent_chain_ndims,
                                        array_ops.rank(x))
            return self._log_gamma_log_prob(x, event_dims)

        def fake_log_prob(x):
            """Cooled version of the target distribution."""
            return 1.1 * log_gamma_log_prob(x)

        step_size = array_ops.placeholder(np.float32, [], name="step_size")

        if feed_dict is None:
            feed_dict = {}

        feed_dict[step_size] = 0.4

        sample, kernel_results = hmc.kernel(
            target_log_prob_fn=log_gamma_log_prob,
            current_state=initial_draws,
            step_size=step_size,
            num_leapfrog_steps=5,
            seed=43)

        bad_sample, bad_kernel_results = hmc.kernel(
            target_log_prob_fn=fake_log_prob,
            current_state=initial_draws,
            step_size=step_size,
            num_leapfrog_steps=5,
            seed=44)

        [
            acceptance_probs_,
            bad_acceptance_probs_,
            initial_draws_,
            updated_draws_,
            fake_draws_,
        ] = sess.run([
            kernel_results.acceptance_probs,
            bad_kernel_results.acceptance_probs,
            initial_draws,
            sample,
            bad_sample,
        ], feed_dict)

        # Confirm step size is small enough that we usually accept.
        self.assertGreater(acceptance_probs_.mean(), 0.5)
        self.assertGreater(bad_acceptance_probs_.mean(), 0.5)

        # Confirm step size is large enough that we sometimes reject.
        self.assertLess(acceptance_probs_.mean(), 0.99)
        self.assertLess(bad_acceptance_probs_.mean(), 0.99)

        _, ks_p_value_true = stats.ks_2samp(initial_draws_.flatten(),
                                            updated_draws_.flatten())
        _, ks_p_value_fake = stats.ks_2samp(initial_draws_.flatten(),
                                            fake_draws_.flatten())

        logging_ops.vlog(
            1, "acceptance rate for true target: {}".format(
                acceptance_probs_.mean()))
        logging_ops.vlog(
            1, "acceptance rate for fake target: {}".format(
                bad_acceptance_probs_.mean()))
        logging_ops.vlog(
            1, "K-S p-value for true target: {}".format(ks_p_value_true))
        logging_ops.vlog(
            1, "K-S p-value for fake target: {}".format(ks_p_value_fake))
        # Make sure that the MCMC update hasn't changed the empirical CDF much.
        self.assertGreater(ks_p_value_true, 1e-3)
        # Confirm that targeting the wrong distribution does
        # significantly change the empirical CDF.
        self.assertLess(ks_p_value_fake, 1e-6)
Beispiel #37
0
  def testSample(self):
    # TODO(jvdillon): This test should be the basis of a new test fixture which
    # is applied to every distribution. When we make this fixture, we'll also
    # separate the analytical- and sample-based tests as well as for each
    # function tested. For now, we group things so we can recycle one batch of
    # samples (thus saving resources).

    mu = np.array([-1., 1, 0.5], dtype=np.float32)
    diag_large = np.array([1., 0.5, 0.75], dtype=np.float32)
    diag_small = np.array([-1.1, 1.2], dtype=np.float32)
    v = np.array([[0.7, 0.8],
                  [0.9, 1],
                  [0.5, 0.6]], dtype=np.float32)  # shape: [k, r] = [3, 2]

    true_mean = mu
    true_scale = np.diag(diag_large) + np.matmul(np.matmul(
        v, np.diag(diag_small)), v.T)
    true_covariance = np.matmul(true_scale, true_scale.T)
    true_variance = np.diag(true_covariance)
    true_stddev = np.sqrt(true_variance)

    with self.test_session() as sess:
      dist = ds.MultivariateNormalDiagPlusLowRank(
          loc=mu,
          scale_diag=diag_large,
          scale_perturb_factor=v,
          scale_perturb_diag=diag_small,
          validate_args=True)

      # The following distributions will test the KL divergence calculation.
      mvn_identity = ds.MultivariateNormalDiag(
          loc=np.array([1., 2, 0.25], dtype=np.float32),
          validate_args=True)
      mvn_scaled = ds.MultivariateNormalDiag(
          loc=mvn_identity.loc,
          scale_identity_multiplier=2.2,
          validate_args=True)
      mvn_diag = ds.MultivariateNormalDiag(
          loc=mvn_identity.loc,
          scale_diag=np.array([0.5, 1.5, 1.], dtype=np.float32),
          validate_args=True)
      mvn_chol = ds.MultivariateNormalTriL(
          loc=np.array([1., 2, -1], dtype=np.float32),
          scale_tril=np.array([[6., 0, 0],
                               [2, 5, 0],
                               [1, 3, 4]], dtype=np.float32) / 10.,
          validate_args=True)

      scale = dist.scale.to_dense()

      n = int(30e3)
      samps = dist.sample(n, seed=0)
      sample_mean = math_ops.reduce_mean(samps, 0)
      x = samps - sample_mean
      sample_covariance = math_ops.matmul(x, x, transpose_a=True) / n

      sample_kl_identity = math_ops.reduce_mean(
          dist.log_prob(samps) - mvn_identity.log_prob(samps), 0)
      analytical_kl_identity = ds.kl_divergence(dist, mvn_identity)

      sample_kl_scaled = math_ops.reduce_mean(
          dist.log_prob(samps) - mvn_scaled.log_prob(samps), 0)
      analytical_kl_scaled = ds.kl_divergence(dist, mvn_scaled)

      sample_kl_diag = math_ops.reduce_mean(
          dist.log_prob(samps) - mvn_diag.log_prob(samps), 0)
      analytical_kl_diag = ds.kl_divergence(dist, mvn_diag)

      sample_kl_chol = math_ops.reduce_mean(
          dist.log_prob(samps) - mvn_chol.log_prob(samps), 0)
      analytical_kl_chol = ds.kl_divergence(dist, mvn_chol)

      n = int(10e3)
      baseline = ds.MultivariateNormalDiag(
          loc=np.array([-1., 0.25, 1.25], dtype=np.float32),
          scale_diag=np.array([1.5, 0.5, 1.], dtype=np.float32),
          validate_args=True)
      samps = baseline.sample(n, seed=0)

      sample_kl_identity_diag_baseline = math_ops.reduce_mean(
          baseline.log_prob(samps) - mvn_identity.log_prob(samps), 0)
      analytical_kl_identity_diag_baseline = ds.kl_divergence(
          baseline, mvn_identity)

      sample_kl_scaled_diag_baseline = math_ops.reduce_mean(
          baseline.log_prob(samps) - mvn_scaled.log_prob(samps), 0)
      analytical_kl_scaled_diag_baseline = ds.kl_divergence(
          baseline, mvn_scaled)

      sample_kl_diag_diag_baseline = math_ops.reduce_mean(
          baseline.log_prob(samps) - mvn_diag.log_prob(samps), 0)
      analytical_kl_diag_diag_baseline = ds.kl_divergence(baseline, mvn_diag)

      sample_kl_chol_diag_baseline = math_ops.reduce_mean(
          baseline.log_prob(samps) - mvn_chol.log_prob(samps), 0)
      analytical_kl_chol_diag_baseline = ds.kl_divergence(baseline, mvn_chol)

      [
          sample_mean_,
          analytical_mean_,
          sample_covariance_,
          analytical_covariance_,
          analytical_variance_,
          analytical_stddev_,
          scale_,
          sample_kl_identity_, analytical_kl_identity_,
          sample_kl_scaled_, analytical_kl_scaled_,
          sample_kl_diag_, analytical_kl_diag_,
          sample_kl_chol_, analytical_kl_chol_,
          sample_kl_identity_diag_baseline_,
          analytical_kl_identity_diag_baseline_,
          sample_kl_scaled_diag_baseline_, analytical_kl_scaled_diag_baseline_,
          sample_kl_diag_diag_baseline_, analytical_kl_diag_diag_baseline_,
          sample_kl_chol_diag_baseline_, analytical_kl_chol_diag_baseline_,
      ] = sess.run([
          sample_mean,
          dist.mean(),
          sample_covariance,
          dist.covariance(),
          dist.variance(),
          dist.stddev(),
          scale,
          sample_kl_identity, analytical_kl_identity,
          sample_kl_scaled, analytical_kl_scaled,
          sample_kl_diag, analytical_kl_diag,
          sample_kl_chol, analytical_kl_chol,
          sample_kl_identity_diag_baseline,
          analytical_kl_identity_diag_baseline,
          sample_kl_scaled_diag_baseline, analytical_kl_scaled_diag_baseline,
          sample_kl_diag_diag_baseline, analytical_kl_diag_diag_baseline,
          sample_kl_chol_diag_baseline, analytical_kl_chol_diag_baseline,
      ])

      sample_variance_ = np.diag(sample_covariance_)
      sample_stddev_ = np.sqrt(sample_variance_)

      logging.vlog(2, "true_mean:\n{}  ".format(true_mean))
      logging.vlog(2, "sample_mean:\n{}".format(sample_mean_))
      logging.vlog(2, "analytical_mean:\n{}".format(analytical_mean_))

      logging.vlog(2, "true_covariance:\n{}".format(true_covariance))
      logging.vlog(2, "sample_covariance:\n{}".format(sample_covariance_))
      logging.vlog(2, "analytical_covariance:\n{}".format(
          analytical_covariance_))

      logging.vlog(2, "true_variance:\n{}".format(true_variance))
      logging.vlog(2, "sample_variance:\n{}".format(sample_variance_))
      logging.vlog(2, "analytical_variance:\n{}".format(analytical_variance_))

      logging.vlog(2, "true_stddev:\n{}".format(true_stddev))
      logging.vlog(2, "sample_stddev:\n{}".format(sample_stddev_))
      logging.vlog(2, "analytical_stddev:\n{}".format(analytical_stddev_))

      logging.vlog(2, "true_scale:\n{}".format(true_scale))
      logging.vlog(2, "scale:\n{}".format(scale_))

      logging.vlog(2, "kl_identity:  analytical:{}  sample:{}".format(
          analytical_kl_identity_, sample_kl_identity_))

      logging.vlog(2, "kl_scaled:    analytical:{}  sample:{}".format(
          analytical_kl_scaled_, sample_kl_scaled_))

      logging.vlog(2, "kl_diag:      analytical:{}  sample:{}".format(
          analytical_kl_diag_, sample_kl_diag_))

      logging.vlog(2, "kl_chol:      analytical:{}  sample:{}".format(
          analytical_kl_chol_, sample_kl_chol_))

      logging.vlog(
          2, "kl_identity_diag_baseline:  analytical:{}  sample:{}".format(
              analytical_kl_identity_diag_baseline_,
              sample_kl_identity_diag_baseline_))

      logging.vlog(
          2, "kl_scaled_diag_baseline:  analytical:{}  sample:{}".format(
              analytical_kl_scaled_diag_baseline_,
              sample_kl_scaled_diag_baseline_))

      logging.vlog(2, "kl_diag_diag_baseline:  analytical:{}  sample:{}".format(
          analytical_kl_diag_diag_baseline_,
          sample_kl_diag_diag_baseline_))

      logging.vlog(2, "kl_chol_diag_baseline:  analytical:{}  sample:{}".format(
          analytical_kl_chol_diag_baseline_,
          sample_kl_chol_diag_baseline_))

      self.assertAllClose(true_mean, sample_mean_,
                          atol=0., rtol=0.02)
      self.assertAllClose(true_mean, analytical_mean_,
                          atol=0., rtol=1e-6)

      self.assertAllClose(true_covariance, sample_covariance_,
                          atol=0., rtol=0.02)
      self.assertAllClose(true_covariance, analytical_covariance_,
                          atol=0., rtol=1e-6)

      self.assertAllClose(true_variance, sample_variance_,
                          atol=0., rtol=0.02)
      self.assertAllClose(true_variance, analytical_variance_,
                          atol=0., rtol=1e-6)

      self.assertAllClose(true_stddev, sample_stddev_,
                          atol=0., rtol=0.02)
      self.assertAllClose(true_stddev, analytical_stddev_,
                          atol=0., rtol=1e-6)

      self.assertAllClose(true_scale, scale_,
                          atol=0., rtol=1e-6)

      self.assertAllClose(sample_kl_identity_, analytical_kl_identity_,
                          atol=0., rtol=0.02)
      self.assertAllClose(sample_kl_scaled_, analytical_kl_scaled_,
                          atol=0., rtol=0.02)
      self.assertAllClose(sample_kl_diag_, analytical_kl_diag_,
                          atol=0., rtol=0.02)
      self.assertAllClose(sample_kl_chol_, analytical_kl_chol_,
                          atol=0., rtol=0.02)

      self.assertAllClose(
          sample_kl_identity_diag_baseline_,
          analytical_kl_identity_diag_baseline_,
          atol=0., rtol=0.02)
      self.assertAllClose(
          sample_kl_scaled_diag_baseline_,
          analytical_kl_scaled_diag_baseline_,
          atol=0., rtol=0.02)
      self.assertAllClose(
          sample_kl_diag_diag_baseline_,
          analytical_kl_diag_diag_baseline_,
          atol=0., rtol=0.04)
      self.assertAllClose(
          sample_kl_chol_diag_baseline_,
          analytical_kl_chol_diag_baseline_,
          atol=0., rtol=0.02)
Beispiel #38
0
def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx,
                                  extra_feed_dict):
  """Computes the theoretical Jacobian for dy/dx.

  Computes the theoretical Jacobian using the ops generated by
  compute_gradient().

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy parray as the input data for x
    dy: the tensor "dy".
    dy_shape: the dimensions of dy as a tuple or an array of ints.
    dx: Tensor or IndexedSlices representing dx
    extra_feed_dict: dict that allows fixing specified tensor values
      during the jacobian calculation.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "dy_size" columns where "x_size" is the number of elements in x and
    "dy_size" is the number of elements in dy.

  Raises:
    ValueError: If `dy` is empty but the gradient is nonzero.
  """
  # Complex vectors are treated as vectors of twice as many reals.
  if x.dtype.is_complex:
    x_shape = tuple(x_shape) + (2,)
  dy_factor = 2 if dy.dtype.is_complex else 1

  # To compute the jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape)
  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients
  dy_size = _product(dy_shape) * dy_factor

  # Allocate 2-D Jacobian, with x dimensions smashed into the first
  # dimension and y dimensions smashed into the second.
  jacobian = np.zeros((x_size, dy_size),
                      dtype=x.dtype.real_dtype.as_numpy_dtype)

  # For each of the entry of dy, we set this to be 1 and
  # everything else to be 0 and compute the backprop -- this will give us one
  # one column of the Jacobian matrix.
  dy_data = np.zeros(dy_shape, dtype=dy.dtype.as_numpy_dtype)
  dy_data_flat = dy_data.ravel().view(dy.dtype.real_dtype.as_numpy_dtype)
  sess = ops.get_default_session()
  for col in range(dy_size):
    dy_data_flat[col] = 1
    if isinstance(dx, ops.IndexedSlices):
      backprop_indices, backprop_values = sess.run(
          [dx.indices, dx.values],
          feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
      for i, v in zip(backprop_indices, backprop_values):
        r_begin = i * x_val_size
        r_end = r_begin + x_val_size
        jacobian[r_begin:r_end, col] += v.flat
    else:
      assert isinstance(dx, ops.Tensor), "dx = " + str(dx)
      backprop = sess.run(
          dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
      jacobian[:, col] = backprop.ravel().view(jacobian.dtype)
    dy_data_flat[col] = 0

  # If the output is empty, run the gradients at least once and make sure
  # they produce zeros.
  if not dy_size:
    backprop = sess.run(
        dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
    if backprop.shape != x_data.shape:
      raise ValueError("Empty gradient has wrong shape: expected %s, got %s" %
                       (x_data.shape, backprop.shape))
    if np.any(backprop):
      raise ValueError("Empty tensor with nonzero gradients")

  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
  return jacobian
Beispiel #39
0
def compute_receptive_field_from_graph_def(graph_def,
                                           input_node,
                                           output_node,
                                           stop_propagation=None,
                                           input_resolution=None):
    """Computes receptive field (RF) parameters from a Graph or GraphDef object.

  The algorithm stops the calculation of the receptive field whenever it
  encounters an operation in the list `stop_propagation`. Stopping the
  calculation early can be useful to calculate the receptive field of a
  subgraph such as a single branch of the
  [inception network](https://arxiv.org/abs/1512.00567).

  Args:
    graph_def: Graph or GraphDef object.
    input_node: Name of the input node or Tensor object from graph.
    output_node: Name of the output node or Tensor object from graph.
    stop_propagation: List of operations or scope names for which to stop the
      propagation of the receptive field.
    input_resolution: 2D list. If the input resolution to the model is fixed and
      known, this may be set. This is helpful for cases where the RF parameters
      vary depending on the input resolution (this happens since SAME padding in
      tensorflow depends on input resolution in general). If this is None, it is
      assumed that the input resolution is unknown, so some RF parameters may be
      unknown (depending on the model architecture).

  Returns:
    rf_size_x: Receptive field size of network in the horizontal direction, with
      respect to specified input and output.
    rf_size_y: Receptive field size of network in the vertical direction, with
      respect to specified input and output.
    effective_stride_x: Effective stride of network in the horizontal direction,
      with respect to specified input and output.
    effective_stride_y: Effective stride of network in the vertical direction,
      with respect to specified input and output.
    effective_padding_x: Effective padding of network in the horizontal
      direction, with respect to specified input and output.
    effective_padding_y: Effective padding of network in the vertical
      direction, with respect to specified input and output.

  Raises:
    ValueError: If network is not aligned or if either input or output nodes
      cannot be found. For network criterion alignment, see
      photos/vision/features/delf/g3doc/rf_computation.md
  """
    # Convert a graph to graph_def if necessary.
    if isinstance(graph_def, framework_ops.Graph):
        graph_def = graph_def.as_graph_def()

    # Convert tensors to names.
    if isinstance(input_node, framework_ops.Tensor):
        input_node = input_node.op.name
    if isinstance(output_node, framework_ops.Tensor):
        output_node = output_node.op.name

    stop_propagation = stop_propagation or []

    # Computes order of computation for a given graph.
    node_info, name_to_node = graph_compute_order.get_compute_order(
        graph_def=graph_def,
        input_node_name=input_node,
        input_node_size=input_resolution)

    # Sort in reverse topological order.
    ordered_node_info = sorted(node_info.items(), key=lambda x: -x[1].order)

    # Dictionaries to keep track of receptive field, effective stride and
    # effective padding of different nodes.
    rf_sizes_x = {}
    rf_sizes_y = {}
    effective_strides_x = {}
    effective_strides_y = {}
    effective_paddings_x = {}
    effective_paddings_y = {}

    # Initialize dicts for output_node.
    rf_sizes_x[output_node] = 1
    rf_sizes_y[output_node] = 1
    effective_strides_x[output_node] = 1
    effective_strides_y[output_node] = 1
    effective_paddings_x[output_node] = 0
    effective_paddings_y[output_node] = 0

    # Flag to denote if we found output node yet. If we have not, we skip nodes
    # until the output node is found.
    found_output_node = False

    # Flag to denote if padding is undefined. This happens when SAME padding mode
    # is used in conjunction with stride and kernel sizes which make it such that
    # the padding to be applied would depend on the input size. In this case,
    # alignment checks are skipped, and the effective padding is None.
    undefined_padding = False

    for _, (o, node, _, _) in ordered_node_info:
        if node:
            logging.vlog(3, "%10d %-100s %-20s" % (o, node.name[:90], node.op))
        else:
            continue

        # When we find input node, we can stop.
        if node.name == input_node:
            break

        # Loop until we find the output node. All nodes before finding the output
        # one are irrelevant, so they can be skipped.
        if not found_output_node:
            if node.name == output_node:
                found_output_node = True

        if found_output_node:
            if node.name not in rf_sizes_x:
                assert node.name not in rf_sizes_y, (
                    "Node %s is in rf_sizes_y, but "
                    "not in rf_sizes_x" % node.name)
                # In this case, node is not relevant since it's not part of the
                # computation we're interested in.
                logging.vlog(3, "Irrelevant node %s, skipping it...",
                             node.name)
                continue

            # Get params for this layer.
            (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
             padding_y, _, _) = parse_layer_parameters.get_layer_params(
                 node, name_to_node, node_info[node.name].input_size)
            logging.vlog(
                3, "kernel_size_x = %s, kernel_size_y = %s, "
                "stride_x = %s, stride_y = %s, "
                "padding_x = %s, padding_y = %s, input size = %s" %
                (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
                 padding_y, node_info[node.name].input_size))
            if padding_x is None or padding_y is None:
                undefined_padding = True

            # Get parameters at input of this layer which may or may not be propagated
            # to the input layers.
            rf_size_input_x = _get_rf_size_node_input(stride_x, kernel_size_x,
                                                      rf_sizes_x[node.name])
            rf_size_input_y = _get_rf_size_node_input(stride_y, kernel_size_y,
                                                      rf_sizes_y[node.name])
            effective_stride_input_x = _get_effective_stride_node_input(
                stride_x, effective_strides_x[node.name])
            effective_stride_input_y = _get_effective_stride_node_input(
                stride_y, effective_strides_y[node.name])
            if not undefined_padding:
                effective_padding_input_x = _get_effective_padding_node_input(
                    stride_x, padding_x, effective_paddings_x[node.name])
                effective_padding_input_y = _get_effective_padding_node_input(
                    stride_y, padding_y, effective_paddings_y[node.name])
            else:
                effective_padding_input_x = None
                effective_padding_input_y = None
            logging.vlog(
                4, "rf_size_input_x = %s, rf_size_input_y = %s, "
                "effective_stride_input_x = %s, effective_stride_input_y = %s, "
                "effective_padding_input_x = %s, effective_padding_input_y = %s"
                % (rf_size_input_x, rf_size_input_y, effective_stride_input_x,
                   effective_stride_input_y, effective_padding_input_x,
                   effective_padding_input_y))

            # Loop over this node's inputs and potentially propagate information down.
            for inp_name in node.input:
                # Stop the propagation of the receptive field.
                if any(inp_name.startswith(stop) for stop in stop_propagation):
                    logging.vlog(3, "Skipping explicitly ignored node %s.",
                                 inp_name)
                    continue

                logging.vlog(4, "inp_name = %s", inp_name)
                if inp_name.startswith("^"):
                    # The character "^" denotes a control dependency, so this input node
                    # can be safely ignored.
                    continue

                inp_node = name_to_node[inp_name]
                logging.vlog(4, "inp_node = \n%s", inp_node)
                if inp_name in rf_sizes_x:
                    assert inp_name in rf_sizes_y, (
                        "Node %s is in rf_sizes_x, but "
                        "not in rf_sizes_y" % inp_name)
                    logging.vlog(
                        4, "rf_sizes_x[inp_name] = %s,"
                        " rf_sizes_y[inp_name] = %s, "
                        "effective_strides_x[inp_name] = %s,"
                        " effective_strides_y[inp_name] = %s, "
                        "effective_paddings_x[inp_name] = %s,"
                        " effective_paddings_y[inp_name] = %s" %
                        (rf_sizes_x[inp_name], rf_sizes_y[inp_name],
                         effective_strides_x[inp_name],
                         effective_strides_y[inp_name],
                         effective_paddings_x[inp_name],
                         effective_paddings_y[inp_name]))
                    # This node was already discovered through a previous path, so we need
                    # to make sure that graph is aligned. This alignment check is skipped
                    # if the padding is not defined, since in this case alignment cannot
                    # be checked.
                    if not undefined_padding:
                        if effective_strides_x[
                                inp_name] != effective_stride_input_x:
                            raise ValueError(
                                "Graph is not aligned since effective stride from different "
                                "paths is different in horizontal direction")
                        if effective_strides_y[
                                inp_name] != effective_stride_input_y:
                            raise ValueError(
                                "Graph is not aligned since effective stride from different "
                                "paths is different in vertical direction")
                        if (rf_sizes_x[inp_name] -
                                1) / 2 - effective_paddings_x[inp_name] != (
                                    rf_size_input_x -
                                    1) / 2 - effective_padding_input_x:
                            raise ValueError(
                                "Graph is not aligned since center shift from different "
                                "paths is different in horizontal direction")
                        if (rf_sizes_y[inp_name] -
                                1) / 2 - effective_paddings_y[inp_name] != (
                                    rf_size_input_y -
                                    1) / 2 - effective_padding_input_y:
                            raise ValueError(
                                "Graph is not aligned since center shift from different "
                                "paths is different in vertical direction")
                    # Keep track of path with largest RF, for both directions.
                    if rf_sizes_x[inp_name] < rf_size_input_x:
                        rf_sizes_x[inp_name] = rf_size_input_x
                        effective_strides_x[
                            inp_name] = effective_stride_input_x
                        effective_paddings_x[
                            inp_name] = effective_padding_input_x
                    if rf_sizes_y[inp_name] < rf_size_input_y:
                        rf_sizes_y[inp_name] = rf_size_input_y
                        effective_strides_y[
                            inp_name] = effective_stride_input_y
                        effective_paddings_y[
                            inp_name] = effective_padding_input_y
                else:
                    assert inp_name not in rf_sizes_y, (
                        "Node %s is in rf_sizes_y, but "
                        "not in rf_sizes_x" % inp_name)
                    # In this case, it is the first time we encounter this node. So we
                    # propagate the RF parameters.
                    rf_sizes_x[inp_name] = rf_size_input_x
                    rf_sizes_y[inp_name] = rf_size_input_y
                    effective_strides_x[inp_name] = effective_stride_input_x
                    effective_strides_y[inp_name] = effective_stride_input_y
                    effective_paddings_x[inp_name] = effective_padding_input_x
                    effective_paddings_y[inp_name] = effective_padding_input_y

    if not found_output_node:
        raise ValueError("Output node was not found")
    if input_node not in rf_sizes_x:
        raise ValueError("Input node was not found")
    return ReceptiveField(
        (rf_sizes_x[input_node], rf_sizes_y[input_node]),
        (effective_strides_x[input_node], effective_strides_y[input_node]),
        (effective_paddings_x[input_node], effective_paddings_y[input_node]))
Beispiel #40
0
  def gradient(self,
               target,
               sources,
               output_gradients=None,
               unconnected_gradients=UnconnectedGradients.NONE):
    """Computes the gradient using operations recorded in context of this tape.

    Note: Unless you set `persistent=True` a GradientTape can only be used to
    compute one set of gradients (or jacobians).

    Args:
      target: a list or nested structure of Tensors or Variables to be
        differentiated.
      sources: a list or nested structure of Tensors or Variables. `target`
        will be differentiated against elements in `sources`.
      output_gradients: a list of gradients, one for each element of
        target. Defaults to None.
      unconnected_gradients: a value which can either hold 'none' or 'zero' and
        alters the value which will be returned if the target and sources are
        unconnected. The possible values and effects are detailed in
        'UnconnectedGradients' and it defaults to 'none'.

    Returns:
      a list or nested structure of Tensors (or IndexedSlices, or None),
      one for each element in `sources`. Returned structure is the same as
      the structure of `sources`.

    Raises:
      RuntimeError: If called on a used, non-persistent tape.
      RuntimeError: If called inside the context of the tape.
      ValueError: If the target is a variable or if unconnected gradients is
       called with an unknown value.
    """
    if self._tape is None:
      raise RuntimeError("A non-persistent GradientTape can only be used to"
                         "compute one set of gradients (or jacobians)")
    if self._recording:
      if not self._persistent:
        self._pop_tape()
      else:
        logging.log_first_n(
            logging.WARN, "Calling GradientTape.gradient on a persistent "
            "tape inside its context is significantly less "
            "efficient than calling it outside the context (it "
            "causes the gradient ops to be recorded on the "
            "tape, leading to increased CPU and memory usage). "
            "Only call GradientTape.gradient inside the "
            "context if you actually want to trace the "
            "gradient in order to compute higher order "
            "derivatives.", 1)

    num_ndarrays = 0
    flat_targets = []
    for t in nest.flatten(target):
      if not backprop_util.IsTrainable(t):
        logging.vlog(
            logging.WARN, "The dtype of the target tensor must be "
            "floating (e.g. tf.float32) when calling GradientTape.gradient, "
            "got %r", t.dtype)
      if resource_variable_ops.is_resource_variable(t):
        with self:
          t = ops.convert_to_tensor(t)
      elif isinstance(t, np_arrays.ndarray):
        t = t.data
        num_ndarrays += 1
      flat_targets.append(t)
    # Only rewrap if all targets are ndarray. If not, prefer tensors.
    rewrap_as_ndarray = num_ndarrays == len(flat_targets)

    flat_sources = nest.flatten(sources)
    flat_sources_raw = flat_sources
    flat_sources = [_handle_or_self(x) for x in flat_sources]
    for t in flat_sources_raw:
      if not backprop_util.IsTrainable(t):
        logging.vlog(
            logging.WARN, "The dtype of the source tensor must be "
            "floating (e.g. tf.float32) when calling GradientTape.gradient, "
            "got %r", t.dtype)
      if getattr(t, "is_packed", False):
        raise ValueError(
            "GradientTape.gradient is not supported on packed EagerTensors yet."
        )

    if output_gradients is not None:
      output_gradients = [None if x is None else ops.convert_to_tensor(x)
                          for x in nest.flatten(output_gradients)]

    flat_grad = imperative_grad.imperative_grad(
        self._tape,
        flat_targets,
        flat_sources,
        output_gradients=output_gradients,
        sources_raw=flat_sources_raw,
        unconnected_gradients=unconnected_gradients)

    if not self._persistent:
      # Keep track of watched variables before setting tape to None
      self._watched_variables = self._tape.watched_variables()
      self._tape = None

    if rewrap_as_ndarray:
      def _tensor_to_ndarray(x):
        if x is not None:
          return np_arrays.tensor_to_ndarray(x)
        return None
      flat_grad = nest.map_structure(_tensor_to_ndarray, flat_grad)

    grad = nest.pack_sequence_as(sources, flat_grad)
    return grad
    def _get_single_variable(
        self,
        name,
        shape=None,
        dtype=tf.float32,
        initializer=None,
        regularizer=None,
        partition_info=None,
        reuse=None,
        trainable=None,
        caching_device=None,
        validate_shape=True,
        constraint=None,
        synchronization=tf.VariableSynchronization.AUTO,
        aggregation=tf.compat.v1.VariableAggregation.NONE,
    ):
        """Get or create a single Variable (e.g.

        a shard or entire variable).

        See the documentation of get_variable above (ignore partitioning components)
        for details.

        Args:
          name: see get_variable.
          shape: see get_variable.
          dtype: see get_variable.
          initializer: see get_variable.
          regularizer: see get_variable.
          partition_info: _PartitionInfo object.
          reuse: see get_variable.
          trainable: see get_variable.
          caching_device: see get_variable.
          validate_shape: see get_variable.
          constraint: see get_variable.
          synchronization: see get_variable.
          aggregation: see get_variable.

        Returns:
          A Variable.  See documentation of get_variable above.

        Raises:
          ValueError: See documentation of get_variable above.
        """
        # Set to true if initializer is a constant.
        initializing_from_value = False
        if initializer is not None and not callable(initializer):
            initializing_from_value = True
        if shape is not None and initializing_from_value:
            raise ValueError(
                "If initializer is a constant, do not specify shape.")

        dtype = tf.as_dtype(dtype)
        shape = as_shape(shape)

        if name in self._vars:
            # Here we handle the case when returning an existing variable.
            found_var = self._vars[name]
            if not shape.is_compatible_with(found_var.get_shape()):
                raise ValueError(
                    "Trying to share variable %s, but specified shape %s"
                    " and found shape %s." %
                    (name, shape, found_var.get_shape()))
            if not dtype.is_compatible_with(found_var.dtype):
                dtype_str = dtype.name
                found_type_str = found_var.dtype.name
                raise ValueError(
                    "Trying to share variable %s, but specified dtype %s"
                    " and found dtype %s." % (name, dtype_str, found_type_str))
            return found_var

        # The code below handles only the case of creating a new variable.
        if reuse is True:  # pylint: disable=g-bool-id-comparison
            raise ValueError(
                "Variable %s does not exist, or was not created with "
                "tf.get_variable(). Did you mean to set "
                "reuse=tf.AUTO_REUSE in VarScope?" % name)

        # Create the tensor to initialize the variable with default value.
        if initializer is None:
            (
                initializer,
                initializing_from_value,
            ) = self._get_default_initializer(name=name,
                                              shape=shape,
                                              dtype=dtype)
        # Enter an init scope when creating the initializer.
        with tf.init_scope():
            if initializing_from_value:
                init_val = initializer
                variable_dtype = None
            else:
                # Instantiate initializer if provided initializer is a type object.
                if tf_inspect.isclass(initializer):
                    initializer = initializer()
                if shape.is_fully_defined():
                    if ("partition_info"
                            in tf_inspect.getargspec(initializer).args):
                        init_val = functools.partial(
                            initializer,
                            shape.as_list(),
                            dtype=dtype,
                            partition_info=partition_info,
                        )
                    else:
                        init_val = functools.partial(initializer,
                                                     shape.as_list(),
                                                     dtype=dtype)
                    variable_dtype = dtype.base_dtype
                else:
                    init_val = initializer
                    variable_dtype = None

        # Create the variable (Always eagerly as a workaround for a strange
        # tpu / funcgraph / keras functional model interaction )
        with tf.init_scope():
            v = tf.Variable(
                initial_value=init_val,
                name=name,
                trainable=trainable,
                caching_device=caching_device,
                dtype=variable_dtype,
                validate_shape=validate_shape,
                constraint=constraint,
                synchronization=synchronization,
                aggregation=aggregation,
            )

        self._vars[name] = v
        logging.vlog(
            1,
            "Created variable %s with shape %s and init %s",
            v.name,
            format(shape),
            initializer,
        )

        # Run the regularizer if requested and save the resulting loss.
        if regularizer:
            self.add_regularizer(v, regularizer)

        return v
    def testSampleLarge(self):
        mu = np.array([-1., 1], dtype=np.float32)
        scale_tril = np.array([[3., 0], [1, -2]], dtype=np.float32) / 3.

        true_mean = mu
        true_scale = scale_tril
        true_covariance = np.matmul(true_scale, true_scale.T)
        true_variance = np.diag(true_covariance)
        true_stddev = np.sqrt(true_variance)

        dist = tfd.MultivariateNormalTriL(loc=mu,
                                          scale_tril=scale_tril,
                                          validate_args=True)

        # The following distributions will test the KL divergence calculation.
        mvn_chol = tfd.MultivariateNormalTriL(loc=np.array([0.5, 1.2],
                                                           dtype=np.float32),
                                              scale_tril=np.array(
                                                  [[3., 0], [1, 2]],
                                                  dtype=np.float32),
                                              validate_args=True)

        n = int(10e3)
        samps = dist.sample(n, seed=0)
        sample_mean = tf.reduce_mean(samps, 0)
        x = samps - sample_mean
        sample_covariance = tf.matmul(x, x, transpose_a=True) / n

        sample_kl_chol = tf.reduce_mean(
            dist.log_prob(samps) - mvn_chol.log_prob(samps), 0)
        analytical_kl_chol = tfd.kl_divergence(dist, mvn_chol)

        scale = dist.scale.to_dense()

        [
            sample_mean_,
            analytical_mean_,
            sample_covariance_,
            analytical_covariance_,
            analytical_variance_,
            analytical_stddev_,
            sample_kl_chol_,
            analytical_kl_chol_,
            scale_,
        ] = self.evaluate([
            sample_mean,
            dist.mean(),
            sample_covariance,
            dist.covariance(),
            dist.variance(),
            dist.stddev(),
            sample_kl_chol,
            analytical_kl_chol,
            scale,
        ])

        sample_variance_ = np.diag(sample_covariance_)
        sample_stddev_ = np.sqrt(sample_variance_)

        logging.vlog(2, "true_mean:\n{}  ".format(true_mean))
        logging.vlog(2, "sample_mean:\n{}".format(sample_mean_))
        logging.vlog(2, "analytical_mean:\n{}".format(analytical_mean_))

        logging.vlog(2, "true_covariance:\n{}".format(true_covariance))
        logging.vlog(2, "sample_covariance:\n{}".format(sample_covariance_))
        logging.vlog(
            2, "analytical_covariance:\n{}".format(analytical_covariance_))

        logging.vlog(2, "true_variance:\n{}".format(true_variance))
        logging.vlog(2, "sample_variance:\n{}".format(sample_variance_))
        logging.vlog(2,
                     "analytical_variance:\n{}".format(analytical_variance_))

        logging.vlog(2, "true_stddev:\n{}".format(true_stddev))
        logging.vlog(2, "sample_stddev:\n{}".format(sample_stddev_))
        logging.vlog(2, "analytical_stddev:\n{}".format(analytical_stddev_))

        logging.vlog(2, "true_scale:\n{}".format(true_scale))
        logging.vlog(2, "scale:\n{}".format(scale_))

        logging.vlog(
            2, "kl_chol:      analytical:{}  sample:{}".format(
                analytical_kl_chol_, sample_kl_chol_))

        self.assertAllClose(true_mean, sample_mean_, atol=0., rtol=0.03)
        self.assertAllClose(true_mean, analytical_mean_, atol=0., rtol=1e-6)

        self.assertAllClose(true_covariance,
                            sample_covariance_,
                            atol=0.,
                            rtol=0.03)
        self.assertAllClose(true_covariance,
                            analytical_covariance_,
                            atol=0.,
                            rtol=1e-6)

        self.assertAllClose(true_variance,
                            sample_variance_,
                            atol=0.,
                            rtol=0.02)
        self.assertAllClose(true_variance,
                            analytical_variance_,
                            atol=0.,
                            rtol=1e-6)

        self.assertAllClose(true_stddev, sample_stddev_, atol=0., rtol=0.01)
        self.assertAllClose(true_stddev,
                            analytical_stddev_,
                            atol=0.,
                            rtol=1e-6)

        self.assertAllClose(true_scale, scale_, atol=0., rtol=1e-6)

        self.assertAllClose(sample_kl_chol_,
                            analytical_kl_chol_,
                            atol=0.,
                            rtol=0.02)
Beispiel #43
0
def compute_receptive_field_from_graph_def(graph_def, input_node, output_node):
    """Computes receptive field (RF) parameters from a GraphDef object.

  Args:
    graph_def: GraphDef object.
    input_node: Name of the input node from graph.
    output_node: Name of the output node from graph.

  Returns:
    rf_size_x: Receptive field size of network in the horizontal direction, with
      respect to specified input and output.
    rf_size_y: Receptive field size of network in the vertical direction, with
      respect to specified input and output.
    effective_stride_x: Effective stride of network in the horizontal direction,
      with respect to specified input and output.
    effective_stride_y: Effective stride of network in the vertical direction,
      with respect to specified input and output.
    effective_padding_x: Effective padding of network in the horizontal
      direction, with respect to specified input and output.
    effective_padding_y: Effective padding of network in the vertical
      direction, with respect to specified input and output.

  Raises:
    ValueError: If network is not aligned or if either input or output nodes
      cannot be found. For network criterion alignment, see
      photos/vision/features/delf/g3doc/rf_computation.md
  """
    # Computes order of computation for a given graph.
    name_to_order_node = graph_compute_order.get_compute_order(
        graph_def=graph_def)

    # Sort in reverse topological order.
    order = _reverse_sort_by_order(name_to_order_node)

    # Dictionaries to keep track of receptive field, effective stride and
    # effective padding of different nodes.
    rf_sizes_x = {}
    rf_sizes_y = {}
    effective_strides_x = {}
    effective_strides_y = {}
    effective_paddings_x = {}
    effective_paddings_y = {}

    # Initialize dicts for output_node.
    rf_sizes_x[output_node] = 1
    rf_sizes_y[output_node] = 1
    effective_strides_x[output_node] = 1
    effective_strides_y[output_node] = 1
    effective_paddings_x[output_node] = 0
    effective_paddings_y[output_node] = 0

    # Flag to denote if we found output node yet. If we have not, we skip nodes
    # until the output node is found.
    found_output_node = False

    # Flag to denote if padding is undefined. This happens when SAME padding mode
    # is used in conjunction with stride and kernel sizes which make it such that
    # the padding to be applied would depend on the input size. In this case,
    # alignment checks are skipped, and the effective padding is None.
    undefined_padding = False

    for _, (o, node) in order:
        if node:
            logging.vlog(3, "%10d %-100s %-20s" % (o, node.name[:90], node.op))
        else:
            continue

        # When we find input node, we can stop.
        if node.name == input_node:
            break

        # Loop until we find the output node. All nodes before finding the output
        # one are irrelevant, so they can be skipped.
        if not found_output_node:
            if node.name == output_node:
                found_output_node = True

        if found_output_node:
            if node.name not in rf_sizes_x:
                assert node.name not in rf_sizes_y, (
                    "Node %s is in rf_sizes_y, but "
                    "not in rf_sizes_x" % node.name)
                # In this case, node is not relevant since it's not part of the
                # computation we're interested in.
                logging.vlog(3, "Irrelevant node %s, skipping it...",
                             node.name)
                continue

            # Get params for this layer.
            kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, padding_y = (
                _get_layer_params(node, name_to_order_node))
            logging.vlog(
                3, "kernel_size_x = %s, kernel_size_y = %s, "
                "stride_x = %s, stride_y = %s, "
                "padding_x = %s, padding_y = %s" %
                (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
                 padding_y))
            if padding_x is None or padding_y is None:
                undefined_padding = True

            # Get parameters at input of this layer which may or may not be propagated
            # to the input layers.
            rf_size_input_x = _get_rf_size_node_input(stride_x, kernel_size_x,
                                                      rf_sizes_x[node.name])
            rf_size_input_y = _get_rf_size_node_input(stride_y, kernel_size_y,
                                                      rf_sizes_y[node.name])
            effective_stride_input_x = _get_effective_stride_node_input(
                stride_x, effective_strides_x[node.name])
            effective_stride_input_y = _get_effective_stride_node_input(
                stride_y, effective_strides_y[node.name])
            if not undefined_padding:
                effective_padding_input_x = _get_effective_padding_node_input(
                    stride_x, padding_x, effective_paddings_x[node.name])
                effective_padding_input_y = _get_effective_padding_node_input(
                    stride_y, padding_y, effective_paddings_y[node.name])
            else:
                effective_padding_input_x = None
                effective_padding_input_y = None

            # Loop over this node's inputs and potentially propagate information down.
            for inp_name in node.input:
                logging.vlog(4, "inp_name = %s", inp_name)
                inp_node = name_to_order_node[inp_name].node
                logging.vlog(4, "inp_node = \n%s", inp_node)
                if inp_node.name in rf_sizes_x:
                    assert inp_node.name in rf_sizes_y, (
                        "Node %s is in rf_sizes_x, but "
                        "not in rf_sizes_y" % inp_node.name)
                    # This node was already discovered through a previous path, so we need
                    # to make sure that graph is aligned. This alignment check is skipped
                    # if the padding is not defined, since in this case alignment cannot
                    # be checked.
                    if not undefined_padding:
                        if effective_strides_x[
                                inp_node.name] != effective_stride_input_x:
                            raise ValueError(
                                "Graph is not aligned since effective stride from different "
                                "paths is different in horizontal direction")
                        if effective_strides_y[
                                inp_node.name] != effective_stride_input_y:
                            raise ValueError(
                                "Graph is not aligned since effective stride from different "
                                "paths is different in vertical direction")
                        if (rf_sizes_x[inp_node.name] - 1
                            ) / 2 - effective_paddings_x[inp_node.name] != (
                                rf_size_input_x -
                                1) / 2 - effective_padding_input_x:
                            raise ValueError(
                                "Graph is not aligned since center shift from different "
                                "paths is different in horizontal direction")
                        if (rf_sizes_y[inp_node.name] - 1
                            ) / 2 - effective_paddings_y[inp_node.name] != (
                                rf_size_input_y -
                                1) / 2 - effective_padding_input_y:
                            raise ValueError(
                                "Graph is not aligned since center shift from different "
                                "paths is different in vertical direction")
                    # Keep track of path with largest RF, for both directions.
                    if rf_sizes_x[inp_node.name] < rf_size_input_x:
                        rf_sizes_x[inp_node.name] = rf_size_input_x
                        effective_strides_x[
                            inp_node.name] = effective_stride_input_x
                        effective_paddings_x[
                            inp_node.name] = effective_padding_input_x
                    if rf_sizes_y[inp_node.name] < rf_size_input_y:
                        rf_sizes_y[inp_node.name] = rf_size_input_y
                        effective_strides_y[
                            inp_node.name] = effective_stride_input_y
                        effective_paddings_y[
                            inp_node.name] = effective_padding_input_y
                else:
                    assert inp_node.name not in rf_sizes_y, (
                        "Node %s is in rf_sizes_y, but "
                        "not in rf_sizes_x" % inp_node.name)
                    # In this case, it is the first time we encounter this node. So we
                    # propagate the RF parameters.
                    rf_sizes_x[inp_node.name] = rf_size_input_x
                    rf_sizes_y[inp_node.name] = rf_size_input_y
                    effective_strides_x[
                        inp_node.name] = effective_stride_input_x
                    effective_strides_y[
                        inp_node.name] = effective_stride_input_y
                    effective_paddings_x[
                        inp_node.name] = effective_padding_input_x
                    effective_paddings_y[
                        inp_node.name] = effective_padding_input_y

    if not found_output_node:
        raise ValueError("Output node was not found")
    if input_node not in rf_sizes_x:
        raise ValueError("Input node was not found")
    return (rf_sizes_x[input_node], rf_sizes_y[input_node],
            effective_strides_x[input_node], effective_strides_y[input_node],
            effective_paddings_x[input_node], effective_paddings_y[input_node])
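# Hedged usage sketch, not part of the library code above: given a frozen
# GraphDef and the names of its input and output nodes (the names below are
# placeholders, not real model nodes), the receptive-field parameters can be
# queried and printed roughly like this.
def _print_receptive_field(graph_def, input_node="input_image",
                           output_node="my_model/logits"):
    (rf_x, rf_y, stride_x, stride_y, pad_x, pad_y) = (
        compute_receptive_field_from_graph_def(graph_def, input_node,
                                               output_node))
    print("RF size: %dx%d, effective stride: %dx%d, effective padding: %s, %s" %
          (rf_x, rf_y, stride_x, stride_y, pad_x, pad_y))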
    def _ais_gets_correct_log_normalizer(self,
                                         init,
                                         independent_chain_ndims,
                                         sess,
                                         feed_dict=None):
        counter = collections.Counter()

        def proposal_log_prob(x):
            counter["proposal_calls"] += 1
            event_dims = math_ops.range(independent_chain_ndims,
                                        array_ops.rank(x))
            return -0.5 * math_ops.reduce_sum(x**2. + np.log(2 * np.pi),
                                              axis=event_dims)

        def target_log_prob(x):
            counter["target_calls"] += 1
            event_dims = math_ops.range(independent_chain_ndims,
                                        array_ops.rank(x))
            return self._log_gamma_log_prob(x, event_dims)

        if feed_dict is None:
            feed_dict = {}

        num_steps = 200

        _, ais_weights, _ = hmc.sample_annealed_importance_chain(
            proposal_log_prob_fn=proposal_log_prob,
            num_steps=num_steps,
            target_log_prob_fn=target_log_prob,
            step_size=0.5,
            current_state=init,
            num_leapfrog_steps=2,
            seed=45)

        # We have three calls because the calculation of `ais_weights` entails
        # another call to the `convex_combined_log_prob_fn`. We could refactor
        # things to avoid this, if needed (eg, b/72994218).
        self.assertAllEqual(dict(target_calls=3, proposal_calls=3), counter)

        event_shape = array_ops.shape(init)[independent_chain_ndims:]
        event_size = math_ops.reduce_prod(event_shape)

        log_true_normalizer = (
            -self._shape_param * math_ops.log(self._rate_param) +
            math_ops.lgamma(self._shape_param))
        log_true_normalizer *= math_ops.cast(event_size,
                                             log_true_normalizer.dtype)

        log_estimated_normalizer = (math_ops.reduce_logsumexp(ais_weights) -
                                    np.log(num_steps))

        ratio_estimate_true = math_ops.exp(ais_weights - log_true_normalizer)
        ais_weights_size = array_ops.size(ais_weights)
        standard_error = math_ops.sqrt(
            _reduce_variance(ratio_estimate_true) /
            math_ops.cast(ais_weights_size, ratio_estimate_true.dtype))

        [
            ratio_estimate_true_,
            log_true_normalizer_,
            log_estimated_normalizer_,
            standard_error_,
            ais_weights_size_,
            event_size_,
        ] = sess.run([
            ratio_estimate_true,
            log_true_normalizer,
            log_estimated_normalizer,
            standard_error,
            ais_weights_size,
            event_size,
        ], feed_dict)

        logging_ops.vlog(
            1, "        log_true_normalizer: {}\n"
            "   log_estimated_normalizer: {}\n"
            "           ais_weights_size: {}\n"
            "                 event_size: {}\n".format(
                log_true_normalizer_, log_estimated_normalizer_,
                ais_weights_size_, event_size_))
        self.assertNear(ratio_estimate_true_.mean(), 1., 4. * standard_error_)
def _get_computed_nodes(name_to_node,
                        current,
                        node_info,
                        input_node_name='',
                        input_node_size=None):
    """Traverses the graph recursively to compute its topological order.

  Optionally, the function may also compute the input and output feature map
  resolutions at each node. In this case, input_node_name and input_node_size
  must be set. Note that if a node's op type is unknown, the input and output
  resolutions are ignored and set to None.

  Args:
    name_to_node: Dict keyed by node name, each entry containing the node's
      NodeDef.
    current: Current node name.
    node_info: Map of nodes we've already traversed, containing their _node_info
      information.
    input_node_name: Name of node with fixed input resolution (optional).
    input_node_size: Fixed input resolution to use (optional).
  Returns:
    order: Order in topological sort for 'current'.
    input_size: Tensor spatial resolution at input of current node.
    output_size: Tensor spatial resolution at output of current node.
  """
    if current in node_info:
        return (node_info[current].order, node_info[current].input_size,
                node_info[current].output_size)

    node_def = name_to_node[current]

    if current == input_node_name:
        order = 0
        input_size = None
        output_size = input_node_size
        node_info[current] = _node_info(order, node_def, input_size,
                                        output_size)
        return (order, input_size, output_size)

    input_size = None
    output_size = None

    order = 0
    number_inputs = 0
    for each in node_def.input:
        # Parses name of input node.
        if each.startswith('^'):
            # The character '^' denotes a control dependency, so this input node can
            # be safely ignored.
            continue
        each = each.split(':')[0]
        # Recursively computes ordering.
        (parent_order, _,
         parent_output_size) = _get_computed_nodes(name_to_node, each,
                                                   node_info, input_node_name,
                                                   input_node_size)
        order = max(order, parent_order + 1)
        if number_inputs == 0:
            # For all the types of nodes we consider, the first input corresponds to
            # the feature map.
            input_size = parent_output_size
        number_inputs += 1

    # Figure out output size for this layer.
    logging.vlog(3, 'input_size = %s', input_size)
    if input_size is None:
        output_size = None
    else:
        (kernel_size_x, kernel_size_y, stride_x, stride_y, _, _,
         total_padding_x,
         total_padding_y) = (parse_layer_parameters.get_layer_params(
             node_def, name_to_node, input_size, force=True))
        logging.vlog(
            3, 'kernel_size_x = %s, kernel_size_y = %s, '
            'stride_x = %s, stride_y = %s, '
            'total_padding_x = %s, total_padding_y = %s' %
            (kernel_size_x, kernel_size_y, stride_x, stride_y, total_padding_x,
             total_padding_y))
        output_size = [None] * 2
        output_size[0] = _compute_output_resolution(input_size[0],
                                                    kernel_size_x, stride_x,
                                                    total_padding_x)
        output_size[1] = _compute_output_resolution(input_size[1],
                                                    kernel_size_y, stride_y,
                                                    total_padding_y)

    logging.vlog(3, 'output_size = %s', output_size)
    node_info[current] = _node_info(order, node_def, input_size, output_size)

    return order, input_size, output_size
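# Hedged sketch of a driver loop (an assumption, not the library's actual
# get_compute_order implementation): the recursive helper above is typically
# invoked once per graph node so that `node_info` ends up holding the
# topological order and feature-map sizes for the whole graph.
def _compute_order_demo(graph_def, input_node_name='', input_node_size=None):
    name_to_node = {node.name: node for node in graph_def.node}
    node_info = {}
    for node in graph_def.node:
        _get_computed_nodes(name_to_node, node.name, node_info,
                            input_node_name, input_node_size)
    return node_info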
def RunLSTM(sess,
            num_units,
            input_size,
            batch_size,
            time,
            num_layers=1,
            variable_seq_lengths=False,
            time_major=True,
            dynamic_shape_input=False,
            is_training=True,
            dropout=0.,
            num_dirs=1,
            dtype=dtypes.float32):
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    raise ValueError("dropout can not be 0. when test training.")

  # set graph level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  shape = ([time, batch_size, input_size]
           if time_major else [batch_size, time, input_size])
  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
  inputs_static = variable_scope.get_variable(
      "inputs", initializer=inputs_np, dtype=dtype)
  inputs_dynamic = array_ops.placeholder(
      dtype, shape=[None, None, None], name="inputs")
  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)
  initial_c_op = variable_scope.get_variable(
      "initial_c_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  if variable_seq_lengths:
    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
    lengths_v[0] = time  # make sure the max sequence has 'time' elems
    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
  else:
    lengths = None

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)

  with variable_scope.variable_scope("test", initializer=initializer):
    w = variable_scope.get_variable(
        "rnn/lstm_cell/kernel",
        shape=[input_size + num_units, num_units * 4],
        dtype=dtype)
    b = variable_scope.get_variable(
        "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype)

    # canonical lstm. must set forget_bias to 0. to align with cudnn lstm.
    cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
    outputs_op, state_tuple_op = rnn.dynamic_rnn(
        cell,
        inputs_static,
        sequence_length=lengths,
        initial_state=rnn_cell_impl.LSTMStateTuple(
            h=initial_h_op, c=initial_c_op),
        dtype=dtype,
        time_major=time_major,
        scope=None)

  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque([w, b])

  cu_initial_h_op = array_ops.expand_dims(
      initial_h_op, axis=(0 if time_major else 1))
  cu_initial_c_op = array_ops.expand_dims(
      initial_c_op, axis=(0 if time_major else 1))
  cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
      inputs,
      cu_initial_h_op,
      cu_initial_c_op,
      opaque_params,
      sequence_lengths=lengths,
      time_major=time_major,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
  # Remove the trivial 1st dimension.
  cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
      c=array_ops.squeeze(cu_c_op, axis=0 if time_major else 1),
      h=array_ops.squeeze(cu_h_op, axis=0 if time_major else 1))

  if is_training:
    (inp_grad_op, hgrad_op,
     cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients(
         outputs_op, [inputs_static, initial_h_op, initial_c_op, w, b])

    (cu_inp_grad_op, cu_hgrad_op,
     cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients(
         cu_outputs_op,
         [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
    # Remove the trivial 1st dimension
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0 if time_major else 1)
    # Remove the trivial 1st dimension
    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0 if time_major else 1)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    cu_wgrad_op = cu_wgrad_op[0]
    cu_bgrad_op = cu_bgrad_op[0]
    # cudnn lstm has 2 biases per gate. When converting to tf canonical format,
    # the two biases are summed into one. Thus here bias gradient should be
    # halved when comparing with tf lstm.
    cu_bgrad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  if is_training:
    outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
        outputs_op, state_tuple_op, inp_grad_op,
        (hgrad_op, cgrad_op), wgrad_op, bgrad_op
    ])
    (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
     cu_bgrad) = sess.run(
         [
             cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
             (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
         ],
         feed_dict={inputs: inputs_np} if dynamic_shape_input else None)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "state_grad: %s" % str(state_grad))
    logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad))
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
            cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
            cu_bgrad)
  else:
    outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
    cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op],
                                          feed_dict=({
                                              inputs: inputs_np
                                          } if dynamic_shape_input else None))

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
  return outputs, cu_outputs, state_tuple, cu_state_tuple
def _compute_numeric_jacobian(f, y_size, y_dtype, xs, param, delta):
    """Computes the numeric Jacobian for f regarding xs[param].

  One can think of the relation among f, xs and y as y = f(xs).

  Args:
    f: the function.
    y_size: the number of elements of the result.
    y_dtype: the dtype of the result.
    xs: a list of tensors.
    param: the index of the target parameter.
    delta: the amount of perturbation we give to the input.

  Returns:
    A 2-d numpy array representing the Jacobian. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in xs[param]
    and "y_size" is the number of elements in the result.
  """
    # bfloat16 doesn't have enough bits to represent high precision numbers such
    # as delta. Convert to float32 here. Since numeric_jacobian is expected to
    # be the groundtruth to compare against, it shouldn't lose any information.
    x_shape = xs[param].shape
    x_dtype = xs[param].dtype
    if y_dtype == dtypes.bfloat16:
        # Bind the original function under a new name so the lambda below does
        # not end up calling itself recursively.
        original_f = f
        f = lambda *xs: math_ops.cast(original_f(*xs), dtypes.float32)
        y_dtype = dtypes.float32

    # To compute the jacobian, we treat x and y as one-dimensional vectors
    x_size = _product(x_shape) * (2 if x_dtype.is_complex else 1)
    y_size = y_size * (2 if y_dtype.is_complex else 1)
    x_dtype = x_dtype.real_dtype.as_numpy_dtype
    y_dtype = y_dtype.real_dtype.as_numpy_dtype

    xs_dtypes = [x.dtype for x in xs]
    # Converts xs to numpy arrays to do in-place perturbation.
    # Calls asarray() to avoid copying in ravel() later.
    xs = [np.asarray(_to_numpy(x)) for x in xs]
    x = xs[param]

    # Make sure we have the right types
    scale = np.asarray(2 * delta, dtype=y_dtype)[()]

    jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
    # For each of the entry of x, we slightly perturbs this by adding and
    # subtracting a delta and then compute difference between the outputs. This
    # will give us one row of the Jacobian matrix.

    f = _prepare(f, xs_dtypes)
    for row in range(x_size):
        original = x.ravel().view(x_dtype)[row]
        x.ravel().view(x_dtype)[row] += delta
        y_pos = _to_numpy(f(*xs))
        x.ravel().view(x_dtype)[row] = original
        x.ravel().view(x_dtype)[row] -= delta
        y_neg = _to_numpy(f(*xs))
        x.ravel().view(x_dtype)[row] = original
        diff = (y_pos - y_neg) / scale
        jacobian[row, :] = diff.ravel().view(y_dtype)

    logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
    return jacobian
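# Illustration only (plain NumPy, independent of the TF helper above): the same
# central-difference scheme, (f(x + delta) - f(x - delta)) / (2 * delta), applied
# entry by entry to a small function y = [x0 * x1, x0 + x1]. The resulting array
# has x_size rows and y_size columns, matching the convention documented above.
def _numeric_jacobian_demo(x, delta=1e-6):
    import numpy as np
    f = lambda v: np.array([v[0] * v[1], v[0] + v[1]])
    x = np.asarray(x, dtype=np.float64)
    jacobian = np.zeros((x.size, f(x).size))
    for row in range(x.size):
        x_pos, x_neg = x.copy(), x.copy()
        x_pos[row] += delta
        x_neg[row] -= delta
        jacobian[row, :] = (f(x_pos) - f(x_neg)) / (2 * delta)
    return jacobian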
def get_layer_params(node, name_to_node, input_resolution=None, force=False):
    """Gets layer parameters relevant for RF computation.

  Currently, only these nodes are supported:
  - Conv2D
  - DepthwiseConv2dNative
  - Pad
  - MaxPool
  - AvgPool
  - all nodes listed in _UNCHANGED_RF_LAYER_OPS

  Args:
    node: Tensorflow node (NodeDef proto).
    name_to_node: Dict keyed by node name, each entry containing the node's
      NodeDef.
    input_resolution: List with 2 dimensions, denoting the height/width of the
      input feature map to this layer. If set to None, then the padding may be
      undefined (in tensorflow, SAME padding depends on input spatial
      resolution).
    force: If True, the function does not raise a ValueError if the layer op is
      unknown. Instead, in this case it sets each of the returned parameters to
      None.

  Returns:
    kernel_size_x: Kernel size for horizontal direction (integer).
    kernel_size_y: Kernel size for vertical direction (integer).
    stride_x: Stride size for horizontal direction (integer).
    stride_y: Stride size for vertical direction (integer).
    padding_x: Padding size for horizontal direction, left side (integer).
    padding_y: Padding size for vertical direction, top side (integer).
    total_padding_x: Total padding size for horizontal direction (integer).
    total_padding_y: Total padding size for vertical direction (integer).

  Raises:
    ValueError: If layer op is unknown and force is False.
  """
    logging.vlog(3, "node.name = %s", node.name)
    logging.vlog(3, "node.op = %s", node.op)
    logging.vlog(4, "node = %s", node)
    if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative":
        stride_x, stride_y = _stride_size(node, name_to_node)
        kernel_size_x, kernel_size_y = _conv_kernel_size(node, name_to_node)
        # Compute the padding for this node separately for each direction.
        total_padding_x, padding_x = _padding_size_conv_pool(
            node, kernel_size_x, stride_x,
            input_resolution[1] if input_resolution is not None else None)
        total_padding_y, padding_y = _padding_size_conv_pool(
            node, kernel_size_y, stride_y,
            input_resolution[0] if input_resolution is not None else None)
    elif node.op == "Pad":
        # Kernel and stride are simply 1 in this case.
        kernel_size_x = 1
        kernel_size_y = 1
        stride_x = 1
        stride_y = 1
        total_padding_x, padding_x, total_padding_y, padding_y = (
            _padding_size_pad_layer(node, name_to_node))
    elif node.op == "MaxPool" or node.op == "MaxPoolV2" or node.op == "AvgPool":
        stride_x, stride_y = _stride_size(node, name_to_node)
        kernel_size_x, kernel_size_y = _pool_kernel_size(node, name_to_node)
        # Compute the padding for this node separately for each direction.
        total_padding_x, padding_x = _padding_size_conv_pool(
            node, kernel_size_x, stride_x,
            input_resolution[1] if input_resolution is not None else None)
        total_padding_y, padding_y = _padding_size_conv_pool(
            node, kernel_size_y, stride_y,
            input_resolution[0] if input_resolution is not None else None)
    elif node.op in _UNCHANGED_RF_LAYER_OPS:
        # These nodes do not modify the RF parameters.
        kernel_size_x = 1
        kernel_size_y = 1
        stride_x = 1
        stride_y = 1
        total_padding_x = 0
        padding_x = 0
        total_padding_y = 0
        padding_y = 0
    else:
        if force:
            kernel_size_x = None
            kernel_size_y = None
            stride_x = None
            stride_y = None
            total_padding_x = None
            padding_x = None
            total_padding_y = None
            padding_y = None
        else:
            raise ValueError("Unknown layer for operation '%s': %s" %
                             (node.name, node.op))
    return (kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x,
            padding_y, total_padding_x, total_padding_y)
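# Hedged usage sketch: the per-layer parameters can be looked up by node name
# once a name -> NodeDef map has been built from the GraphDef. The node name
# 'resnet/conv1/Conv2D' and the 224x224 input resolution are placeholders; the
# resolution is only needed so that SAME padding can be resolved.
def _layer_params_demo(graph_def, node_name="resnet/conv1/Conv2D",
                       input_resolution=(224, 224)):
    name_to_node = {node.name: node for node in graph_def.node}
    return get_layer_params(name_to_node[node_name], name_to_node,
                            input_resolution=list(input_resolution))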
def create_tpu_mesh(mesh_dim_names: List[str],
                    mesh_shape: List[int],
                    mesh_name: str,
                    ring_dims: Optional[int] = None,
                    ring_axes: Optional[List[str]] = None,
                    ring_bounds: Optional[List[int]] = None,
                    can_split_host_across_rings: bool = True,
                    build_ring_across_rings: bool = False,
                    rotate_ring_across_rings: bool = False) -> layout_lib.Mesh:
  """Returns a TPU mesh optimized for AllReduce ring reductions.

  Only as many of the leading axes specified by `ring_axes` as necessary will be
  used to build rings, as long as the subslice formed by these axes has enough
  cores to contain a ring of the required size. The leftover axes in `ring_axes`
  won't affect results.

  See go/dtensor-device-assignment-api for details and performance tuning tips.

  Args:
    mesh_dim_names: List of mesh dimension names.
    mesh_shape: Shape of the mesh.
    mesh_name: A unique name for the mesh. If empty, internally generate one.
    ring_dims: Optional; The number of leading (ring_dims > 0) or trailing
      (ring_dims < 0) mesh dimensions to build rings for. If unspecified, build
      rings for all but the first dimension.
    ring_axes: Optional; A permutation of ["x", "y", "z", "core"], specifying
      the order of TPU topology axes to build rings in. If unspecified, default
      to ["core", "x", "y", "z"].
    ring_bounds: Optional; The maximum number of devices on each axis, in the x,
      y, z, core order. If unspecified, default to physical topology limits.
    can_split_host_across_rings: Optional; If true, devices attached to the same
      host (i.e., DTensor client) may get assigned to different rings. Setting
      it to false may cause some combinations of arguments to be infeasible; see
      DeviceAssignmentTest.testCreateMesh[No]SplittingHosts* for examples.
    build_ring_across_rings: Optional; If true, also build a data-parallel ring
      across model-parallel rings. This ring could be strided.
    rotate_ring_across_rings: Optional; If true, build the data-parallel ring in
      column-major instead of row-major order.
  """

  logging.info("Building a TPU mesh %s of shape %s", mesh_name, mesh_shape)
  logging.info("Requested ring_dims: %s", ring_dims)
  logging.info("Requested ring_axes: %s", ring_axes)
  logging.info("Requested ring_bounds: %s", ring_bounds)
  logging.info("Requested can_split_host_across_rings: %s",
               can_split_host_across_rings)
  if not mesh_name:
    mesh_name = "mesh_%f" % time.time()
  logging.info("Requested mesh_name: %s", mesh_name)

  # By default, build rings for all but the first (usually batch) dimension.
  if ring_dims is None:
    ring_dims = 1 - len(mesh_shape)
  elif ring_dims < -len(mesh_shape) or ring_dims > len(mesh_shape):
    raise ValueError("Invalid ring_dims value: %d" % ring_dims)
  logging.info("Actual ring_dims: %s", ring_dims)

  # By default, vary axes in the core -> x -> y -> z order.
  if ring_axes is None:
    ring_axes = ["core", "x", "y", "z"]
  elif len(ring_axes) != 4:
    raise ValueError("Expected 4 elements in ring_axes, got %s" % ring_axes)
  elif sorted(ring_axes) != ["core", "x", "y", "z"]:
    raise ValueError("Invalid ring_axes value: %s" % ring_axes)
  logging.info("Actual ring_axes: %s", ring_axes)

  # Validate ring_bounds values.
  global _tpu_topology
  if _tpu_topology is None:
    raise ValueError(
        "Invalid TPU topology, run dtensor_initialize_tpu_system() first")
  topology_shape = list(_tpu_topology.mesh_shape)
  if ring_bounds is None:
    ring_bounds = topology_shape
  elif len(ring_bounds) != 4:
    raise ValueError("Expected 4 elements in ring_bounds, got %s" % ring_bounds)
  elif ring_bounds > topology_shape:
    raise ValueError("ring_bounds %s should be <= topology sizes %s" %
                     (ring_bounds, topology_shape))
  logging.info("Actual ring_bounds: %s", ring_bounds)

  # Compute ring_size, the number of cores in a ring.
  if ring_dims > 0:
    ring_size = np.prod(mesh_shape[:ring_dims])
  elif ring_dims < 0:
    ring_size = np.prod(mesh_shape[ring_dims:])
  else:
    ring_size = 1  # single-core rings
  logging.info("Actual ring_size: %d", ring_size)

  # Rearrange all cores according to the axis iteration order.
  global_core_locations = _enumerate_core_locations(
      topology_shape, ring_bounds, ring_axes, can_split_host_across_rings,
      ring_size)
  logging.vlog(1, "Enumerated core locations: %s", global_core_locations)
  num_cores = len(global_core_locations)

  # The mesh to be created must use all TPU cores in the system.
  mesh_size = np.prod(mesh_shape)
  if mesh_size != num_cores:
    raise ValueError(
        "Invalid mesh size: mesh shape %s cannot 1:1 map to %d TPU cores" %
        (mesh_shape, num_cores))

  # Build a ring for the `ring_size` dimension and, if required, a strided ring
  # for the orthogonal dimension.
  if build_ring_across_rings:
    global_core_locations = _build_orthogonal_rings(global_core_locations,
                                                    ring_size,
                                                    rotate_ring_across_rings)
  else:
    permutation = _build_all_reduce_ring(global_core_locations[:ring_size])
    for r in range(0, num_cores, ring_size):
      global_core_locations[r:r + ring_size] = [
          global_core_locations[r + permutation[i]] for i in range(ring_size)
      ]
    logging.vlog(1, "Permutated core locations: %s", global_core_locations)

  # From this point on, change from List[CoreLocation] to List[List[int]] for
  # easier interaction with the C++ API.
  global_core_locations = [l.to_list() for l in global_core_locations]
  global _dtensor_device
  if _dtensor_device is None:
    raise ValueError(
        "Invalid system device, run dtensor_initialize_tpu_system() first")
  global_core_ids = _dtensor_device.tpu_core_locations_to_ids(
      global_core_locations)

  # Store a per-mesh mapping in the runtime.
  _dtensor_device.set_tpu_core_ids(mesh_name, global_core_ids)

  # Create the mesh by manually specifying local_device_ids.
  local_core_locations = _tpu_topology.device_coordinates[api.client_id()]
  indexes = [
      global_core_locations.index(list(local_core_location))
      for local_core_location in local_core_locations
  ]
  global_device_ids, local_device_ids, local_device_list = _create_device_array(
      mesh_shape, _TPU_DEVICE_TYPE, None, local_device_ids=indexes)
  return layout_lib.Mesh(mesh_dim_names, global_device_ids, local_device_ids,
                         local_device_list, mesh_name)
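# Hedged usage sketch: assuming dtensor_initialize_tpu_system() has already been
# called and exactly eight TPU cores are visible (both assumptions), a 4x2
# batch/model-parallel mesh could be requested like this.
def _tpu_mesh_demo():
  return create_tpu_mesh(mesh_dim_names=["batch", "model"],
                         mesh_shape=[4, 2],
                         mesh_name="demo_mesh")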
def _compute_theoretical_jacobian(f, y_shape, y_dtype, xs, param):
    """Computes the theoretical Jacobian for f regarding xs[param].

  One can think of the relation among f, xs and y as y = f(xs).

  Args:
    f: the function.
    y_shape: the shape of the result.
    y_dtype: the dtype of the result.
    xs: a list of tensors.
    param: the index of the target parameter.

  Returns:
    A 2-d numpy array representing the Jacobian. It has "y_size" rows
    and "x_size" columns where "x_size" is the number of elements in xs[param]
    and "y_size" is the number of elements in the result.

  Raises:
    ValueError: If result is empty but the gradient is nonzero.
  """
    x = xs[param]
    # Complex vectors are treated as vectors of twice as many reals.
    x_shape = tuple(x.shape) + (2, ) if x.dtype.is_complex else x.shape
    y_factor = 2 if y_dtype.is_complex else 1

    # To compute the jacobian, we treat x and y as one-dimensional vectors.
    x_size = _product(x_shape)
    x_val_size = _product(x_shape[1:])  # This is used for sparse gradients
    y_size = _product(y_shape) * y_factor

    # Allocate 2-D Jacobian, with y dimensions smashed into the first
    # dimension and x dimensions smashed into the second.
    jacobian = np.zeros((y_size, x_size),
                        dtype=x.dtype.real_dtype.as_numpy_dtype)

    # For each of the entry of dy, we set this to be 1 and
    # everything else to be 0 and compute the gradients -- this will give us one
    # row of the Jacobian matrix.
    dy_data = np.zeros(y_shape, dtype=y_dtype.as_numpy_dtype)
    dy_data_flat = dy_data.ravel().view(y_dtype.real_dtype.as_numpy_dtype)
    grad_fn_unprep = backprop.gradients_function(f, [param])
    grad_fn = _prepare(lambda dy, *xs: grad_fn_unprep(*xs, dy=dy),
                       [y_dtype] + [z.dtype for z in xs],
                       [None] + [z.shape for z in xs])
    for row in range(y_size):
        dy_data_flat[row] = 1
        grad = _to_numpy(grad_fn(dy_data, *xs)[0])
        grad = _eval_indexed_slices(grad)
        if isinstance(grad, indexed_slices.IndexedSlicesValue):
            for i, v in zip(grad.indices, grad.values):
                c_begin = i * x_val_size
                c_end = c_begin + x_val_size
                jacobian[row, c_begin:c_end] += v.flat
        elif grad is not None:
            jacobian[row, :] = grad.ravel().view(jacobian.dtype)
        # This reset of `dy_data_flat` needs to happen after `grad` is copied to
        # `jacobian` because `grad` and `dy_data_flat` may share memory.
        dy_data_flat[row] = 0

    # If the output is empty, run the gradients at least once and make sure
    # they produce zeros.
    if y_size == 0:  # don't use 'not y_size', because y_size may not be an int
        grad = _to_numpy(grad_fn(dy_data, *xs)[0])
        if grad.shape != x.shape:
            raise ValueError(
                "Empty gradient has wrong shape: expected %s, got %s" %
                (x.shape, grad.shape))
        if np.any(grad):
            raise ValueError("Empty tensor with nonzero gradients")

    logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
    return jacobian
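# Illustration only (plain NumPy): the helper above recovers the Jacobian one
# row at a time by backpropagating one-hot upstream gradients dy. For a linear
# map y = A @ x, the vector-Jacobian product with a one-hot dy is exactly one
# row of A, so stacking the rows reproduces A.
def _theoretical_jacobian_demo(A):
    import numpy as np
    A = np.asarray(A, dtype=np.float64)
    y_size, x_size = A.shape
    jacobian = np.zeros((y_size, x_size))
    for row in range(y_size):
        dy = np.zeros(y_size)
        dy[row] = 1.0  # One-hot upstream gradient, as in the loop above.
        jacobian[row, :] = dy @ A  # Vector-Jacobian product = row `row` of A.
    return jacobian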
def _AggregatedGrads(grads,
                     op,
                     gradient_uid,
                     loop_state,
                     aggregation_method=None):
    """Get the aggregated gradients for op.

  Args:
    grads: The map of memoized gradients.
    op: The op to get gradients for.
    gradient_uid: A unique identifier within the graph indicating
      which invocation of gradients is being executed. Used to cluster
      ops for compilation.
    loop_state: An object for maintaining the state of the while loops in the
                graph. It is of type ControlFlowState. None if the graph
                contains no while loops.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of gradients, one per each output of `op`. If the gradients
      for a particular output is a list, this function aggregates it
      before returning.

  Raises:
    TypeError: if the incoming grads are not Tensors or IndexedSlices.
    ValueError: if the arguments are invalid.

  """
    if aggregation_method is None:
        aggregation_method = AggregationMethod.DEFAULT
    if aggregation_method not in [
            AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
            AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
    ]:
        raise ValueError("Invalid aggregation_method specified %s." %
                         aggregation_method)
    out_grads = _GetGrads(grads, op)
    for i, out_grad in enumerate(out_grads):
        if loop_state:
            if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
                assert control_flow_util.IsLoopSwitch(op)
                continue
        # Grads have to be Tensors or IndexedSlices
        if (isinstance(out_grad, collections_abc.Sequence) and not all(
                isinstance(g, (ops.Tensor, ops.IndexedSlices))
                for g in out_grad if g is not None)):
            raise TypeError("gradients have to be either all Tensors "
                            "or all IndexedSlices")
        # Aggregate multiple gradients, and convert [] to None.
        if out_grad:
            if len(out_grad) < 2:
                used = "nop"
                out_grads[i] = out_grad[0]
            elif all(
                    isinstance(g, ops.Tensor) for g in out_grad
                    if g is not None):
                tensor_shape = _AccumulatorShape(out_grad)
                if aggregation_method in [
                        AggregationMethod.EXPERIMENTAL_TREE,
                        AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
                ]:
                    # Aggregate all gradients by doing pairwise sums: this may
                    # reduce performance, but it can improve memory because the
                    # gradients can be released earlier.
                    #
                    # TODO(vrv): Consider replacing this with a version of
                    # tf.AddN() that eagerly frees its inputs as soon as they are
                    # ready, so the order of this tree does not become a problem.
                    used = "tree"
                    with ops.name_scope(op.name + "_gradient_sum"):
                        running_sum = out_grad[0]
                        for grad in out_grad[1:]:
                            running_sum = math_ops.add_n([running_sum, grad])
                        out_grads[i] = running_sum
                else:
                    used = "add_n"
                    out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
                logging.vlog(2, "  _AggregatedGrads %d x %s using %s",
                             len(out_grad), tensor_shape, used)
            else:
                out_grads[i] = backprop.aggregate_indexed_slices_gradients(
                    out_grad)  # pylint: disable=protected-access
        else:  # not out_grad
            # out_grads[i] is [], thus its aggregation is simply None.
            out_grads[i] = None
    return out_grads
def entity_to_graph(o, program_ctx, arg_values, arg_types):
    """Compile a Python entity into equivalent TensorFlow.

  The function will also recursively compile all the entities that `o`
  references, updating `dependency_cache`.

  This function is reentrant, and relies on dependency_cache to avoid
  generating duplicate code.

  Args:
    o: A Python entity.
    program_ctx: A ProgramContext object.
    arg_values: A dict containing value hints for symbols like function
        parameters.
    arg_types: A dict containing type hints for symbols like function
        parameters.

  Returns:
    A tuple (ast, new_name, namespace):
        * ast: An AST representing an entity with interface equivalent to `o`,
            but which, when executed, creates a TF graph.
        * new_name: The symbol name under which the new entity can be found.
        * namespace: A dict mapping all symbols visible to the converted entity,
            keyed by their symbol name.

  Raises:
    ValueError: if the entity type is not supported.
  """
    logging.vlog(logging.DEBUG, 'Converting %s', o)

    if tf_inspect.isclass(o):
        node, name, ns = class_to_graph(o, program_ctx)
    elif tf_inspect.isfunction(o):
        node, name, ns = function_to_graph(o, program_ctx, arg_values,
                                           arg_types)
    elif tf_inspect.ismethod(o):
        node, name, ns = function_to_graph(o, program_ctx, arg_values,
                                           arg_types)
    # TODO(mdan,yashkatariya): Remove when object conversion is implemented.
    elif hasattr(o, '__class__'):
        raise NotImplementedError(
            'Object conversion is not yet supported. If you are '
            'trying to convert code that uses an existing object, '
            'try including the creation of that object in the '
            'conversion. For example, instead of converting the method '
            'of a class, try converting the entire class instead. '
            'See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/'
            'contrib/autograph/README.md#using-the-functional-api '
            'for more information.')
    else:
        raise ValueError(
            'Entity "%s" has unsupported type "%s". Only functions and classes are '
            'supported for now.' % (o, type(o)))

    # TODO(mdan): This is temporary. it should be created using a converter.
    # TODO(mdan): The attribute should be added with a helper, not directly.
    # The helper can ensure there are no collisions.
    template = '''
      entity.autograph_info__ = {}
  '''
    node.extend(templates.replace(template, entity=name))

    program_ctx.add_to_cache(o, node)

    if logging.get_verbosity() <= logging.DEBUG:
        logging.vlog(logging.DEBUG, 'Compiled output of %s:\n\n%s\n', o,
                     compiler.ast_to_source(node))

    if program_ctx.options.recursive:
        while True:
            candidate = None
            for obj in program_ctx.name_map.keys():
                if obj not in program_ctx.dependency_cache:
                    candidate = obj
                    break
            if candidate is None:
                break
            if (hasattr(candidate, 'im_class') and getattr(
                    candidate, 'im_class') not in program_ctx.partial_types):
                # Class members are converted with their objects, unless they're
                # only converted partially.
                continue
            entity_to_graph(candidate, program_ctx, {}, {})

    return node, name, ns
def converted_call(f, owner, options, *args, **kwargs):
    """Compiles a function call inline. For internal use only."""
    logging.vlog(logging.DEBUG, 'Converted call: %s; owner: %s', f, owner)

    if owner is not None:
        if not isinstance(f, str):
            raise ValueError(
                'When owner is specified, the function name must be specified as'
                ' a string: {}'.format(f))

        # Special case when the owner is a 'super' object. In that case lookups of
        # dynamic attributes won't work. See
        # inspect_utils.SuperWrapperForDynamicAttrs.
        if isinstance(owner, super):
            owner = inspect_utils.SuperWrapperForDynamicAttrs(owner)

        f = getattr(owner, f)

    if inspect_utils.isbuiltin(f):
        return py_builtins.overload_of(f)(*args, **kwargs)

    # TODO(mdan): This needs cleanup.
    # In particular, we may want to avoid renaming functions altogether.
    if not options.force_conversion and conversion.is_whitelisted_for_graph(f):

        # Args typically include `self`, as required by the conversion process.
        # When conversion is skipped, `self` is not necessary, because the
        # original bound method is being executed. This code removes it.
        if tf_inspect.ismethod(f) and args:
            f_class = inspect_utils.getmethodclass(f)
            if args[0] is f_class:
                args = args[1:]

        return f(*args, **kwargs)

    # internal_convert_user_code is for example turned off when issuing a dynamic
    # call conversion from generated code while in nonrecursive mode. In that
    # case we evidently don't want to recurse, but we still have to convert
    # things like builtins.
    if not options.internal_convert_user_code:
        return f(*args, **kwargs)

    # Unwrap functools.partial objects
    # TODO(mdan): Consider sharing unwrapping logic with tf_inspect.
    while isinstance(f, functools.partial):
        args = f.args + args
        new_kwargs = {}
        if f.keywords is not None:
            new_kwargs.update(f.keywords)
        new_kwargs.update(kwargs)
        kwargs = new_kwargs
        f = f.func

    if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
        # Regular functions
        target_entity = f
        arg_map_target = f
        f_class = inspect_utils.getmethodclass(f)

        # TODO(b/119246461): This may be more elegantly handled using __get__?
        if f_class is not None:
            # If this is a method call, it may or may not include self.
            #
            # Example when self is included:
            #   converted_call(to_graph(foo.bar), foo)
            #
            # Example when self is not included:
            #   super(...).foo(args)
            #
            if owner is not None and (not args or args[0] is not owner):
                effective_args = (owner, ) + args
            else:
                # When the owner is not specified, use the result of
                # inspect_utils.getmethodclass.
                # TODO(b/119246461): Make sure an owner is always specified.
                if not args or args[0] is not f_class:
                    effective_args = (f_class, ) + args
                else:
                    effective_args = (f_class, ) + args[1:]
            partial_types = (f_class, )
        else:
            effective_args = args
            partial_types = ()

    elif tf_inspect.isclass(f):
        # Constructors
        target_entity = f
        arg_map_target = f.__init__
        effective_args = args
        partial_types = ()

    elif hasattr(f, '__call__') and hasattr(f, '__class__'):
        # Callable objects
        target_entity = f.__call__
        arg_map_target = f.__call__
        effective_args = (f, ) + args
        partial_types = (f.__class__, )

    else:
        raise NotImplementedError('unknown callable type "%s"' % type(f))

    arg_values = tf_inspect.getcallargs(arg_map_target, *args, **kwargs)
    arg_types = {}
    for name, arg in arg_values.items():
        arg_class = arg.__class__
        arg_types[name] = (arg_class.__name__, arg_class)

    # When called from within a decorator, this is the only indication that
    # the function is a method - it appears that the decorator is applied
    # before the method is bound.
    if not partial_types:
        if 'self' in arg_values:
            if tf_inspect.isclass(arg_values['self'].__class__):
                partial_types = (arg_values['self'].__class__, )
        elif 'cls' in arg_values:
            if tf_inspect.isclass(arg_values['cls']):
                partial_types = (arg_values['cls'], )

    converted_f = to_graph(
        target_entity,
        recursive=options.recursive,
        arg_values=arg_values,
        arg_types=arg_types,
        experimental_optional_features=options.optional_features,
        experimental_strip_decorators=options.strip_decorators,
        experimental_verbose=options.verbose,
        experimental_partial_types=partial_types)

    result = converted_f(*effective_args, **kwargs)

    # The converted function's closure is simply inserted into the function's
    # module __dict__. Since modules are permanently cached, that results in
    # leaking the entire closure.
    # Normally, it's not safe to delete the module because that may release said
    # closure as well. However, in the case of converted_call we are certain the
    # function will not be executed again, so the closure should no longer be
    # needed so long as the function doesn't return any executable code.
    # TODO(mdan): Attach the closure properly, using cells.
    if all(map(_is_not_callable, nest.flatten(result))):
        del sys.modules[converted_f.__module__]

    return result
def is_whitelisted_for_graph(o):
    """Check whether an entity is whitelisted for use in graph mode.

  Examples of whitelisted entities include all members of the tensorflow
  package.

  Args:
    o: A Python entity.
  Returns:
    Boolean
  """
    # TODO(b/120224672): Fix this.
    if isinstance(o, functools.partial):
        # tf_inspect.getmodule(functools.partial(...)) otherwise returns None since
        # functools.partial objects do not have a __module__ attribute.
        m = functools
    else:
        m = tf_inspect.getmodule(o)
    if not hasattr(m, '__name__'):
        logging.vlog(1, '%s is NOT whitelisted for graph: unknown module name',
                     o)
        return False

    for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
        if m.__name__.startswith(prefix):
            logging.vlog(1, '%s is whitelisted: name starts with "%s"', o,
                         prefix)
            return True

    if hasattr(o, 'autograph_info__'):
        return True

    if (not inspect_utils.isweakrefself(o) and not tf_inspect.isclass(o)
            and hasattr(o, '__call__') and hasattr(o, '__class__')):
        # Callable objects: whitelisted if their __call__ method is.
        retval = is_whitelisted_for_graph(o.__call__)
        if retval:
            logging.vlog(1, '%s is whitelisted: object __call__ whitelisted', o)
        return retval

    if tf_inspect.ismethod(o):
        # Methods of whitelisted classes are also whitelisted, even if they are
        # bound via user subclasses.
        #
        # For example, suppose `tf.Foo` has a method called `bar`, and `baz` is
        # defined as below. `tf.Foo` is whitelisted. Then `baz.bar` is also
        # whitelisted.
        #
        #   class Custom(tf.Foo):
        #     pass
        #
        #   baz = Custom()
        #
        # For the example above, if `Custom` did overload `bar`, then it would no
        # longer be whitelisted.

        owner_class = inspect_utils.getmethodclass(o)
        if owner_class is not None:
            owner_class = inspect_utils.getdefiningclass(o, owner_class)
            if is_whitelisted_for_graph(owner_class):
                logging.vlog(1, '%s is whitelisted: owner is whitelisted %s',
                             o, owner_class)
                return True

    if inspect_utils.isnamedtuple(o):
        # Due to the way they're constructed, namedtuple types cannot be converted
        # because they don't expose source code. But we assume they are safe for
        # graph mode since they are just containers.
        if tf_inspect.isclass(o) and len(o.__bases__) > 1:
            logging.log_first_n(
                logging.level_warning(),
                'Entity {} looks like a namedtuple subclass. If it has any custom'
                ' methods, they will not be converted by AutoGraph.'.format(o),
                1)
        logging.vlog(1, '%s is whitelisted: named tuple', o)
        return True

    logging.vlog(1, '%s is NOT whitelisted for graph', o)
    return False
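# Hedged sketch: per the docstring above, members of the tensorflow package are
# whitelisted (their module name starts with a prefix listed in
# config.DEFAULT_UNCOMPILED_MODULES), while a plain user-defined function is not.
def _whitelist_demo():
    import tensorflow as tf

    def user_fn(x):
        return x + 1

    return is_whitelisted_for_graph(tf.matmul), is_whitelisted_for_graph(user_fn)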
    def gradient(self,
                 target,
                 sources,
                 output_gradients=None,
                 unconnected_gradients=UnconnectedGradients.NONE):
        """Computes the gradient using operations recorded in context of this tape.

    Args:
      target: a list or nested structure of Tensors or Variables to be
        differentiated.
      sources: a list or nested structure of Tensors or Variables. `target`
        will be differentiated against elements in `sources`.
      output_gradients: a list of gradients, one for each element of
        target. Defaults to None.
      unconnected_gradients: a value which can either hold 'none' or 'zero' and
        alters the value which will be returned if the target and sources are
        unconnected. The possible values and effects are detailed in
        'UnconnectedGradients' and it defaults to 'none'.

    Returns:
      a list or nested structure of Tensors (or IndexedSlices, or None),
      one for each element in `sources`. Returned structure is the same as
      the structure of `sources`.

    Raises:
      RuntimeError: if called inside the context of the tape, or if called more
       than once on a non-persistent tape.
      ValueError: if the target is a variable or if unconnected gradients is
       called with an unknown value.
    """
        if self._tape is None:
            raise RuntimeError(
                "GradientTape.gradient can only be called once on "
                "non-persistent tapes.")
        if self._recording:
            if not self._persistent:
                self._pop_tape()
            else:
                logging.log_first_n(
                    logging.WARN,
                    "Calling GradientTape.gradient on a persistent "
                    "tape inside its context is significantly less "
                    "efficient than calling it outside the context (it "
                    "causes the gradient ops to be recorded on the "
                    "tape, leading to increased CPU and memory usage). "
                    "Only call GradientTape.gradient inside the "
                    "context if you actually want to trace the "
                    "gradient in order to compute higher order "
                    "derivatives.", 1)

        flat_targets = []
        for t in nest.flatten(target):
            if not backprop_util.IsTrainable(t):
                logging.vlog(
                    logging.WARN, "The dtype of the target tensor must be "
                    "floating (e.g. tf.float32) when calling GradientTape.gradient, "
                    "got %r", t.dtype)
            if resource_variable_ops.is_resource_variable(t):
                with self:
                    t = ops.convert_to_tensor(t)
            flat_targets.append(t)

        flat_sources = nest.flatten(sources)
        flat_sources_raw = flat_sources
        flat_sources = [_handle_or_self(x) for x in flat_sources]
        for t in flat_sources_raw:
            if not backprop_util.IsTrainable(t):
                logging.vlog(
                    logging.WARN, "The dtype of the source tensor must be "
                    "floating (e.g. tf.float32) when calling GradientTape.gradient, "
                    "got %r", t.dtype)

        if output_gradients is not None:
            output_gradients = [
                None if x is None else ops.convert_to_tensor(x)
                for x in nest.flatten(output_gradients)
            ]

        flat_grad = imperative_grad.imperative_grad(
            self._tape,
            flat_targets,
            flat_sources,
            output_gradients=output_gradients,
            sources_raw=flat_sources_raw,
            unconnected_gradients=unconnected_gradients)

        if not self._persistent:
            self._tape = None

        grad = nest.pack_sequence_as(sources, flat_grad)
        return grad
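
For reference, a minimal usage sketch of this method through the public tf.GradientTape API; the gradient values in the comments assume standard TensorFlow 2.x eager execution.

import tensorflow as tf

x = tf.constant(3.0)
with tf.GradientTape(persistent=True) as tape:
  tape.watch(x)  # x is a constant, so it must be watched explicitly
  y = x * x      # y = x^2
  z = y * y      # z = x^4

# gradient() may be called more than once because the tape is persistent.
print(tape.gradient(y, x))  # tf.Tensor(6.0, ...)   since dy/dx = 2x
print(tape.gradient(z, x))  # tf.Tensor(108.0, ...) since dz/dx = 4x^3
del tape  # release the tape's resources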
Example #56
def _build_all_reduce_ring(core_locations: List[_CoreLocation],
                           rotate: bool = False) -> List[int]:
  """Reorders a list of TPU cores to optimize for AllReduce performance.

  This is ported from the C++ tensorflow::BuildAllReduceRing function,
  mixed with some logic from TF TPU's device_assignment._ring_3d.

  Args:
    core_locations: A list of core locations expressed as [x, y, z, core].
    rotate: If true, scan the cores in a column-major order. False by default.

  Returns:
    A permutation of the input list such that neighbors in the sequence are
    nearby in the TPU topology.
  """

  permutation = list(range(len(core_locations)))
  if not permutation:
    return permutation
  logging.vlog(2, "Core locations in: %s", core_locations)

  first_column = min([l.x for l in core_locations])
  first_row = min([l.y for l in core_locations])
  same_z = (len(set([l.z for l in core_locations])) == 1)
  logging.vlog(2, "first_column: %d", first_column)
  logging.vlog(2, "first_row: %d", first_row)
  logging.vlog(2, "same_z: %s", same_z)

  def _cmp_2d(ia: int, ib: int) -> int:
    if not rotate:
      a = core_locations[ia]
      b = core_locations[ib]

      # Order the first column last in the sequence, except for the first row.
      a_first = (a.x == first_column and a.y != first_row)
      b_first = (b.x == first_column and b.y != first_row)
      if a_first != b_first:
        return -1 if b_first else 1

      # Order rows in increasing order, unless in the first column.
      if a.y != b.y:
        return b.y - a.y if a_first else a.y - b.y

      # Order even rows left to right, odd rows right to left.
      if a.x != b.x:
        return a.x - b.x if a.y % 2 == 0 else b.x - a.x

      # Order cores in increasing order.
      return a.core - b.core
    else:
      a = core_locations[ia]
      b = core_locations[ib]

      # Order the first row last in the sequence, except for the first column.
      a_first = (a.y == first_row and a.x != first_column)
      b_first = (b.y == first_row and b.x != first_column)
      if a_first != b_first:
        return -1 if b_first else 1

      # Order columns in increasing order, unless in the first row.
      if a.x != b.x:
        return b.x - a.x if a_first else a.x - b.x

      # Order even columns top down, odd columns bottom up.
      if a.y != b.y:
        return a.y - b.y if a.x % 2 == 0 else b.y - a.y

      # Order cores in increasing order.
      return a.core - b.core

  def _cmp_3d(ia: int, ib: int) -> int:
    a = core_locations[ia]
    b = core_locations[ib]

    a_corner = (a.x == first_column and a.y == first_row)
    b_corner = (b.x == first_column and b.y == first_row)

    # If both are in the corner, order in reverse z then core order.
    if a_corner and b_corner:
      return b.z - a.z if a.z != b.z else a.core - b.core

    # Corner cores always go after non-corner cores.
    if a_corner != b_corner:
      return -1 if b_corner else 1

    # Both non-corner cores are on the same z-plane. Reverse odd z-planes.
    if a.z == b.z:
      return _cmp_2d(ia, ib) if a.z % 2 == 0 else -_cmp_2d(ia, ib)

    # Both non-corner cores are on different z-planes. Smaller z goes first.
    return a.z - b.z

  # If all cores are on the same z-plane, order as usual. Otherwise, order
  # neighbor z-planes in opposite orders. Stack all z-planes along the z axis
  # and connect them in one corner.
  if same_z:
    permutation.sort(key=functools.cmp_to_key(_cmp_2d))
  else:
    permutation.sort(key=functools.cmp_to_key(_cmp_3d))
  logging.vlog(2, "Permutation out: %s", permutation)
  return permutation
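
The comparators above are plain three-way functions plugged into list.sort via functools.cmp_to_key. A standalone sketch of that pattern, with a made-up comparator that mimics the "snake" row ordering used in _cmp_2d:

import functools

def cmp_snake(a, b):
  # Sort (x, y) points by row; even rows go left to right, odd rows right to
  # left, similar to the zig-zag ordering in _cmp_2d above.
  if a[1] != b[1]:
    return a[1] - b[1]
  return a[0] - b[0] if a[1] % 2 == 0 else b[0] - a[0]

points = [(0, 0), (1, 0), (0, 1), (1, 1)]
print(sorted(points, key=functools.cmp_to_key(cmp_snake)))
# [(0, 0), (1, 0), (1, 1), (0, 1)]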
Example #57
    def _run(self, sess, enqueue_op, feed_fn, coord=None):
        """Execute the enqueue op in a loop, close the queue in case of error.

        Args:
          sess: A `Session`.
          enqueue_op: The `Operation` to run.
          feed_fn: the feed function to pass to `sess.run`.
          coord: Optional `Coordinator` object for reporting errors and checking
            for stop conditions.

        """
        if coord:
            coord.register_thread(threading.current_thread())

        waitempty = self._waitempty
        decremented = False

        try:
            while True:
                if coord and coord.should_stop():
                    break
                try:
                    try:
                        # NOTE(@dade): if the generator stops, keep waiting until the remaining queued data has been consumed.
                        feed_dict = None if feed_fn is None else feed_fn()
                        # enqueue data
                        sess.run(enqueue_op, feed_dict=feed_dict)
                    except StopIteration:
                        if coord and waitempty:
                            # wait for dequeueing
                            while not coord.should_stop():
                                # with self._lock:
                                if sess.run(self.queue.size()) == 0:
                                    raise StopIteration
                        raise StopIteration

                except (errors.OutOfRangeError, errors.CancelledError,
                        StopIteration):
                    # This exception indicates that a queue was closed.
                    with self._lock:
                        self._runs_per_session[sess] -= 1
                        decremented = True
                        if self._runs_per_session[sess] == 0:
                            try:
                                sess.run(self._close_op)
                            except Exception as e:
                                # Intentionally ignore errors from close_op.
                                logging.vlog(1, "Ignored exception: %s",
                                             str(e))
                        return
        except Exception as e:
            # This catches all other exceptions.
            if coord:
                coord.request_stop(e)
            else:
                logging.error("Exception in QueueRunner: %s", str(e))
                with self._lock:
                    self._exceptions_raised.append(e)
                raise
        finally:
            # Make sure we account for all terminations: normal or errors.
            if not decremented:
                with self._lock:
                    self._runs_per_session[sess] -= 1
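
A simplified, TensorFlow-free illustration of the bookkeeping pattern above: several worker threads decrement a shared counter under a lock, and only the last one to finish performs the close step (which stands in for sess.run(self._close_op)). Names here are illustrative only.

import threading

class SharedWork:

  def __init__(self, num_workers):
    self._lock = threading.Lock()
    self._remaining = num_workers
    self.closed = False

  def worker_done(self):
    with self._lock:
      self._remaining -= 1
      if self._remaining == 0:
        self.closed = True  # last worker closes the shared resource

work = SharedWork(num_workers=3)
threads = [threading.Thread(target=work.worker_done) for _ in range(3)]
for t in threads:
  t.start()
for t in threads:
  t.join()
print(work.closed)  # True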
Example #58
def type_spec_from_value(element, use_fallback=True):
    """Creates a type specification for the given value.

  Args:
    element: The element to create the type specification for.
    use_fallback: Whether to fall back to converting the element to a tensor
      in order to compute its `TypeSpec`.

  Returns:
    A nested structure of `TypeSpec`s that represents the type specification
    of `element`.

  Raises:
    TypeError: If a `TypeSpec` cannot be built for `element`, because its type
      is not supported.
  """
    spec = type_spec._type_spec_from_value(element)  # pylint: disable=protected-access
    if spec is not None:
        return spec

    if isinstance(element, collections_abc.Mapping):
        # We create a shallow copy in an attempt to preserve the key order.
        #
        # Note that we do not guarantee that the key order is preserved, which is
        # a limitation inherited from `copy()`. As a consequence, callers of
        # `type_spec_from_value` should not assume that the key order of a `dict`
        # in the returned nested structure matches the key order of the
        # corresponding `dict` in the input value.
        if isinstance(element, collections.defaultdict):
            ctor = lambda items: type(element)(element.default_factory, items)
        else:
            ctor = type(element)
        return ctor([(k, type_spec_from_value(v)) for k, v in element.items()])

    if isinstance(element, tuple):
        if hasattr(element, "_fields") and isinstance(
                element._fields, collections_abc.Sequence) and all(
                    isinstance(f, six.string_types) for f in element._fields):
            if isinstance(element, wrapt.ObjectProxy):
                element_type = type(element.__wrapped__)
            else:
                element_type = type(element)
            # `element` is a namedtuple
            return element_type(*[type_spec_from_value(v) for v in element])
        # `element` is not a namedtuple
        return tuple([type_spec_from_value(v) for v in element])

    if use_fallback:
        # As a fallback try converting the element to a tensor.
        try:
            tensor = ops.convert_to_tensor(element)
            spec = type_spec_from_value(tensor)
            if spec is not None:
                return spec
        except (ValueError, TypeError) as e:
            logging.vlog(
                3, "Failed to convert %r to tensor: %s" %
                (type(element).__name__, e))

    raise TypeError("Could not build a TypeSpec for %r with type %s" %
                    (element, type(element).__name__))
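
A quick usage sketch through the public tf.type_spec_from_value symbol; the exact repr of the returned specs may differ between TensorFlow versions.

import tensorflow as tf

spec = tf.type_spec_from_value(tf.constant([[1, 2], [3, 4]]))
print(spec)  # roughly: TensorSpec(shape=(2, 2), dtype=tf.int32, name=None)

# Nested structures map to nested TypeSpecs, mirroring the Mapping and tuple
# branches above.
nested = {"x": tf.constant(1.0), "y": (tf.constant([1, 2]),)}
print(tf.type_spec_from_value(nested))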
Example #59
def _AggregatedGrads(grads, op, loop_state, aggregation_method=None):
  """Get the aggregated gradients for op.

  Args:
    grads: The map of memoized gradients.
    op: The op to get gradients for.
    loop_state: An object for maintaining the state of the while loops in the
                graph. It is of type ControlFlowState. None if the graph
                contains no while loops.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of gradients, one per output of `op`. If the gradient
      for a particular output is a list, this function aggregates it
      before returning.

  Raises:
    TypeError: if the incoming grads are not Tensors or IndexedSlices.
    ValueError: if the arguments are invalid.

  """
  if aggregation_method is None:
    aggregation_method = AggregationMethod.DEFAULT
  if aggregation_method not in [
      AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
      AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
  ]:
    raise ValueError("Invalid aggregation_method specified %s." %
                     aggregation_method)
  out_grads = _GetGrads(grads, op)
  for i, out_grad in enumerate(out_grads):
    if loop_state:
      if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
        assert control_flow_ops.IsLoopSwitch(op)
        continue
    # Grads have to be Tensors or IndexedSlices
    if (isinstance(out_grad, collections.Sequence) and not all([
        isinstance(g, (ops.Tensor, ops.IndexedSlices)) for g in out_grad
        if g is not None
    ])):
      raise TypeError("gradients have to be either all Tensors "
                      "or all IndexedSlices")
    # Aggregate multiple gradients, and convert [] to None.
    if out_grad:
      if len(out_grad) < 2:
        used = "nop"
        out_grads[i] = out_grad[0]
      elif all([isinstance(g, ops.Tensor) for g in out_grad if g is not None]):
        tensor_shape = _AccumulatorShape(out_grad)
        if (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
            and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
          # The benefit of using AccumulateN is that its inputs can be combined
          # in any order and this can allow the expression to be evaluated with
          # a smaller memory footprint.  When used with gpu_allocator_retry,
          # it is possible to compute a sum of terms which are much larger than
          # total GPU memory.
          # AccumulateN can currently only be used if we know the shape for
          # an accumulator variable.  If this is not known, or if we only have
          # 2 grads then we fall through to the "tree" case below.
          used = "accumulate_n"
          out_grads[i] = math_ops.accumulate_n(out_grad)
        elif aggregation_method in [
            AggregationMethod.EXPERIMENTAL_TREE,
            AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
        ]:
          # Aggregate all gradients by doing pairwise sums: this may
          # reduce performance, but it can improve memory because the
          # gradients can be released earlier.
          #
          # TODO(vrv): Consider replacing this with a version of
          # tf.AddN() that eagerly frees its inputs as soon as they are
          # ready, so the order of this tree does not become a problem.
          used = "tree"
          with ops.name_scope(op.name + "_gradient_sum"):
            running_sum = out_grad[0]
            for grad in out_grad[1:]:
              running_sum = math_ops.add_n([running_sum, grad])
            out_grads[i] = running_sum
        else:
          used = "add_n"
          out_grads[i] = _MultiDeviceAddN(out_grad)
        logging.vlog(2, "  _AggregatedGrads %d x %s using %s",
                     len(out_grad), tensor_shape, used)
      else:
        out_grad = math_ops._as_indexed_slices_list(
            [g for g in out_grad if g is not None])
        out_grad = [_HandleNestedIndexedSlices(x) for x in out_grad]
        # Form IndexedSlices out of the concatenated values and
        # indices.
        out_grads[i] = ops.IndexedSlices(
            array_ops.concat_v2([x.values for x in out_grad], 0),
            array_ops.concat_v2([x.indices for x in out_grad], 0),
            out_grad[0].dense_shape)
    else:
      out_grads[i] = []
  return out_grads
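
A standalone sketch of the pairwise "tree" branch above, which replaces one wide AddN with a chain of two-input sums so earlier partial results can be released sooner. The helper name and inputs are illustrative.

import tensorflow as tf

def tree_sum(tensors):
  # Pairwise running sum, mirroring the EXPERIMENTAL_TREE branch: each add_n
  # has only two inputs, so earlier partial sums can be freed before the
  # next addition runs.
  running = tensors[0]
  for t in tensors[1:]:
    running = tf.add_n([running, t])
  return running

grads = [tf.ones([2, 2]) for _ in range(4)]
print(tree_sum(grads))  # 4.0 in every position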
Example #60
def _get_layer_params(node, name_to_order_node):
    """Gets layer parameters relevant for RF computation.

  Currently, only these nodes are supported:
  - Conv2D
  - DepthwiseConv2dNative
  - Pad
  - MaxPool
  - AvgPool
  - all nodes listed in _UNCHANGED_RF_LAYER_OPS

  Args:
    node: Tensorflow node (NodeDef proto).
    name_to_order_node: Map from name to {order, node}. Output of
      graph_compute_order.get_compute_order().

  Returns:
    kernel_size_x: Kernel size for horizontal direction (integer).
    kernel_size_y: Kernel size for vertical direction (integer).
    stride_x: Stride size for horizontal direction (integer).
    stride_y: Stride size for vertical direction (integer).
    padding_x: Padding size for horizontal direction (integer).
    padding_y: Padding size for vertical direction (integer).

  Raises:
    ValueError: If layer op is unknown.
  """
    logging.vlog(3, "node.op = %s", node.op)
    logging.vlog(4, "node = %s", node)
    if node.op == "Conv2D" or node.op == "DepthwiseConv2dNative":
        stride_x, stride_y = _stride_size(node)
        kernel_size_x, kernel_size_y = _conv_kernel_size(
            node, name_to_order_node)
        # Compute the padding for this node separately for each direction.
        padding_x = _padding_size_conv_pool(node, kernel_size_x, stride_x)
        padding_y = _padding_size_conv_pool(node, kernel_size_y, stride_y)
    elif node.op == "Pad":
        # Kernel and stride are simply 1 in this case.
        kernel_size_x = 1
        kernel_size_y = 1
        stride_x = 1
        stride_y = 1
        padding_x, padding_y = _padding_size_pad_layer(node,
                                                       name_to_order_node)
    elif node.op == "MaxPool" or node.op == "AvgPool":
        stride_x, stride_y = _stride_size(node)
        kernel_size_x, kernel_size_y = _pool_kernel_size(node)
        # Compute the padding for this node separately for each direction.
        padding_x = _padding_size_conv_pool(node, kernel_size_x, stride_x)
        padding_y = _padding_size_conv_pool(node, kernel_size_y, stride_y)
    elif node.op in _UNCHANGED_RF_LAYER_OPS:
        # These nodes do not modify the RF parameters.
        kernel_size_x = 1
        kernel_size_y = 1
        stride_x = 1
        stride_y = 1
        padding_x = 0
        padding_y = 0
    else:
        raise ValueError("Unknown layer op: %s" % node.op)
    return kernel_size_x, kernel_size_y, stride_x, stride_y, padding_x, padding_y
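
The per-layer (kernel, stride, padding) values returned here are typically folded into the standard receptive-field recurrence. A minimal sketch of that accumulation for a single spatial direction (not part of the library code above):

def accumulate_receptive_field(layer_params):
  # layer_params: iterable of (kernel_size, stride, padding) tuples, one per
  # layer, ordered from input to output, for one spatial direction.
  rf_size = 1           # receptive field of the input itself
  effective_stride = 1  # product of the strides of all layers seen so far
  effective_padding = 0
  for kernel_size, stride, padding in layer_params:
    rf_size += (kernel_size - 1) * effective_stride
    effective_padding += padding * effective_stride
    effective_stride *= stride
  return rf_size, effective_stride, effective_padding

# Two 3x3 stride-1 convs followed by a 2x2 stride-2 pool, all unpadded.
print(accumulate_receptive_field([(3, 1, 0), (3, 1, 0), (2, 2, 0)]))
# (6, 2, 0)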