Example #1
  def replace(self, episodes, length, rows=None):
    """Replace full episodes.

    Args:
      episodes: Tuple of transition quantities with batch and time dimensions.
      length: Batch of sequence lengths.
      rows: Episodes to replace, defaults to all.

    Returns:
      Operation.
    """
    rows = tf.range(self._capacity) if rows is None else rows
    assert rows.shape.ndims == 1
    assert_capacity = tf.assert_less(
        rows, self._capacity, message='capacity exceeded')
    with tf.control_dependencies([assert_capacity]):
      assert_max_length = tf.assert_less_equal(
          length, self._max_length, message='max length exceeded')
    replace_ops = []
    with tf.control_dependencies([assert_max_length]):
      for buffer_, elements in zip(self._buffers, episodes):
        replace_op = tf.scatter_update(buffer_, rows, elements)
        replace_ops.append(replace_op)
    with tf.control_dependencies(replace_ops):
      return tf.scatter_update(self._length, rows, length)
Example #2
def preprocess_for_inception(images):
  """Preprocess images for inception.

  Args:
    images: images minibatch. Shape [batch size, width, height,
      channels]. Values are in [0..255].

  Returns:
    preprocessed_images
  """

  # Images should have 3 channels.
  assert images.shape[3].value == 3

  # tfgan_eval.preprocess_image function takes values in [0, 1], so rescale.
  with tf.control_dependencies([tf.assert_greater_equal(images, 0.0),
                                tf.assert_less_equal(images, 255.0)]):
    images = tf.identity(images)

  preprocessed_images = tf.map_fn(
      fn=tfgan_eval.preprocess_image,
      elems=images,
      back_prop=False
  )

  return preprocessed_images
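
A minimal sketch (not from the original source) of the range-check idiom used above, assuming TensorFlow 1.x: the assertions only execute because the gated `tf.identity` puts them on the compute path.

import tensorflow as tf

images = tf.fill([1, 4, 4, 3], 128.0)  # values inside [0, 255]
with tf.control_dependencies([tf.assert_greater_equal(images, 0.0),
                              tf.assert_less_equal(images, 255.0)]):
  images = tf.identity(images)
with tf.Session() as sess:
  sess.run(images)  # passes; any value outside [0, 255] would raise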
Example #3
def new_mean_squared(grad_vec, decay, ms):
  """Calculates the new accumulated mean squared of the gradient.

  Args:
    grad_vec: the vector for the current gradient
    decay: the decay term
    ms: the previous mean_squared value

  Returns:
    the new mean_squared value
  """
  decay_size = decay.get_shape().num_elements()
  decay_check_ops = [
      tf.assert_less_equal(decay, 1., summarize=decay_size),
      tf.assert_greater_equal(decay, 0., summarize=decay_size)]

  with tf.control_dependencies(decay_check_ops):
    grad_squared = tf.square(grad_vec)

  # If the previous mean_squared is the 0 vector, don't use the decay and just
  # return the full grad_squared. This should only happen on the first timestep.
  decay = tf.cond(tf.reduce_all(tf.equal(ms, 0.)),
                  lambda: tf.zeros_like(decay, dtype=tf.float32), lambda: decay)

  # Update the running average of squared gradients.
  epsilon = 1e-12
  return (1. - decay) * (grad_squared + epsilon) + decay * ms
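
A hedged usage sketch, assuming TensorFlow 1.x and that `new_mean_squared` above is in scope: on the first step the accumulator is the zero vector, so the decay is bypassed and the squared gradient is returned directly.

import tensorflow as tf

grad_vec = tf.constant([0.5, -2.0])
decay = tf.constant([0.9, 0.9])
ms = tf.zeros([2])  # first timestep: triggers the zero-decay branch

new_ms = new_mean_squared(grad_vec, decay, ms)
with tf.Session() as sess:
  print(sess.run(new_ms))  # ~[0.25, 4.0], i.e. grad_vec**2 + epsilon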
Example #4
def calculate_reshape(original_shape, new_shape, validate=False, name=None):
  """Calculates the reshaped dimensions (replacing up to one -1 in reshape)."""
  batch_shape_static = tensor_util.constant_value_as_shape(new_shape)
  if batch_shape_static.is_fully_defined():
    return np.int32(batch_shape_static.as_list()), batch_shape_static, []
  with tf.name_scope(name, "calculate_reshape", [original_shape, new_shape]):
    original_size = tf.reduce_prod(original_shape)
    implicit_dim = tf.equal(new_shape, -1)
    size_implicit_dim = (
        original_size // tf.maximum(1, -tf.reduce_prod(new_shape)))
    new_ndims = tf.shape(new_shape)
    expanded_new_shape = tf.where(  # Assumes exactly one `-1`.
        implicit_dim, tf.fill(new_ndims, size_implicit_dim), new_shape)
    validations = [] if not validate else [
        tf.assert_rank(
            original_shape, 1, message="Original shape must be a vector."),
        tf.assert_rank(new_shape, 1, message="New shape must be a vector."),
        tf.assert_less_equal(
            tf.count_nonzero(implicit_dim, dtype=tf.int32),
            1,
            message="At most one dimension can be unknown."),
        tf.assert_positive(
            expanded_new_shape, message="Shape elements must be >=-1."),
        tf.assert_equal(
            tf.reduce_prod(expanded_new_shape),
            original_size,
            message="Shape sizes do not match."),
    ]
    return expanded_new_shape, batch_shape_static, validations
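
A usage sketch (an assumption, not part of the original source), taking `tensor_util` to be `tensorflow.python.framework.tensor_util` as in TFP's internals:

import numpy as np
import tensorflow as tf

original_shape = tf.constant([2, 3, 4])
new_shape = tf.constant([6, -1])  # exactly one implicit dimension
expanded, static_shape, validations = calculate_reshape(
    original_shape, new_shape, validate=True)
with tf.control_dependencies(validations):
  expanded = tf.identity(expanded)
with tf.Session() as sess:
  print(sess.run(expanded))  # [6 4], since 2*3*4 == 6*4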
Example #5
 def test_doesnt_raise_when_both_empty(self):
   with self.test_session():
     larry = tf.constant([])
     curly = tf.constant([])
     with tf.control_dependencies([tf.assert_less_equal(larry, curly)]):
       out = tf.identity(larry)
     out.eval()
Example #6
 def test_doesnt_raise_when_less_equal_and_broadcastable_shapes(self):
   with self.test_session():
     small = tf.constant([1], name="small")
     big = tf.constant([3, 1], name="big")
     with tf.control_dependencies([tf.assert_less_equal(small, big)]):
       out = tf.identity(small)
     out.eval()
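
For illustration, a standalone sketch (TensorFlow 1.x) of the broadcasting behavior this test relies on: `tf.assert_less_equal` broadcasts its arguments like other elementwise ops, so `[1]` is compared against both entries of `[3, 1]`.

import tensorflow as tf

small = tf.constant([1])
big = tf.constant([3, 1])
with tf.control_dependencies([tf.assert_less_equal(small, big)]):
  out = tf.identity(small)
with tf.Session() as sess:
  print(sess.run(out))  # [1]; 1 <= 3 and 1 <= 1 both hold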
Example #7
  def _maybe_check_valid_shape(self, shape, validate_args):
    """Check that a shape Tensor is int-type and otherwise sane."""
    if not shape.dtype.is_integer:
      raise TypeError('{} dtype ({}) should be `int`-like.'.format(
          shape, shape.dtype.name))

    assertions = []

    ndims = tf.rank(shape)
    ndims_ = tensor_util.constant_value(ndims)
    if ndims_ is not None and ndims_ > 1:
      raise ValueError('`{}` rank ({}) should be <= 1.'.format(
          shape, ndims_))
    elif validate_args:
      assertions.append(
          tf.assert_less_equal(
              ndims, 1, message='`{}` rank should be <= 1.'.format(shape)))

    # Note, we might be inclined to use tensor_util.constant_value_as_shape
    # here, but that method coerces negative values into `None`s, rendering the
    # checks we do below impossible.
    shape_tensor_ = tensor_util.constant_value(shape)
    if shape_tensor_ is not None:
      es = np.int32(shape_tensor_)
      if sum(es == -1) > 1:
        raise ValueError(
            '`{}` must have at most one `-1` (given {})'
            .format(shape, es))
      if np.any(es < -1):
        raise ValueError(
            '`{}` elements must be either positive integers or `-1` '
            '(given {}).'
            .format(shape, es))
    elif validate_args:
      assertions.extend([
          tf.assert_less_equal(
              tf.reduce_sum(tf.cast(tf.equal(shape, -1), tf.int32)),
              1,
              message='`{}` elements must have at most one `-1`.'
              .format(shape)),
          tf.assert_greater_equal(
              shape,
              -1,
              message='`{}` elements must be either positive integers or `-1`.'
              .format(shape)),
      ])
    return assertions
Example #8
 def test_raises_when_greater(self):
   with self.test_session():
     small = tf.constant([1, 2], name="small")
     big = tf.constant([3, 4], name="big")
     with tf.control_dependencies([tf.assert_less_equal(big, small)]):
       out = tf.identity(small)
     with self.assertRaisesOpError("big.*small"):
       out.eval()
Example #9
def remidify(pitches):
  """Transforms [0, 88) to MIDI pitches [21, 108]."""
  assertions = [
      tf.assert_greater_equal(pitches, 0),
      tf.assert_less_equal(pitches, 87)
  ]
  with tf.control_dependencies(assertions):
    return pitches + 21
Example #10
def demidify(pitches):
  """Transforms MIDI pitches [21,108] to [0, 88)."""
  assertions = [
      tf.assert_greater_equal(pitches, 21),
      tf.assert_less_equal(pitches, 108)
  ]
  with tf.control_dependencies(assertions):
    return pitches - 21
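
A round-trip sketch (not from the original source), assuming TensorFlow 1.x and the two helpers above: `demidify(remidify(p))` recovers `p` for any `p` in [0, 88).

import tensorflow as tf

pitches = tf.constant([0, 39, 87])
round_trip = demidify(remidify(pitches))
with tf.Session() as sess:
  print(sess.run(round_trip))  # [ 0 39 87]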
Example #11
  def _augment_data(self, inout, nchan=6):
    """Flip, crop and rotate samples randomly."""

    with tf.name_scope('data_augmentation'):
      if self.fliplr:
        inout = tf.image.random_flip_left_right(inout, seed=1234)
      if self.flipud:
        inout = tf.image.random_flip_up_down(inout, seed=3456)
      if self.rotate:
        angle = tf.random_uniform((), minval=0, maxval=4, dtype=tf.int32, seed=4567)
        inout = tf.case([(tf.equal(angle, 1), lambda: tf.image.rot90(inout, k=1)),
                         (tf.equal(angle, 2), lambda: tf.image.rot90(inout, k=2)),
                         (tf.equal(angle, 3), lambda: tf.image.rot90(inout, k=3))],
                        lambda: inout)

      inout.set_shape([None, None, nchan])

      with tf.name_scope('crop'):
        shape = tf.shape(inout)
        new_height = tf.to_int32(self.output_resolution[0])
        new_width = tf.to_int32(self.output_resolution[1])
        height_ok = tf.assert_less_equal(new_height, shape[0])
        width_ok = tf.assert_less_equal(new_width, shape[1])
        with tf.control_dependencies([height_ok, width_ok]):
          if self.random_crop:
            inout = tf.random_crop(
                inout, tf.stack([new_height, new_width, nchan]))
          else:
            height_offset = tf.to_int32((shape[0]-new_height)/2)
            width_offset = tf.to_int32((shape[1]-new_width)/2)
            inout = tf.image.crop_to_bounding_box(
                inout, height_offset, width_offset,
                new_height, new_width)

      inout.set_shape([None, None, nchan])
      inout = tf.image.resize_images(
          inout, [self.output_resolution[0], self.output_resolution[1]])
      fullres = inout

      with tf.name_scope('resize'):
        new_size = 256
        inout = tf.image.resize_images(
            inout, [new_size, new_size],
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

      return fullres, inout
Example #12
 def test_raises_when_less_equal_but_non_broadcastable_shapes(self):
   with self.test_session():
     small = tf.constant([1, 1, 1], name="small")
     big = tf.constant([3, 1], name="big")
     with self.assertRaisesRegexp(ValueError, "broadcast"):
       with tf.control_dependencies([tf.assert_less_equal(small, big)]):
         out = tf.identity(small)
       out.eval()
Example #13
def scale_to_inception_range(image):
  """Scales an image in the range [0,1] to [-1,1] as expected by inception."""
  # Assert that incoming images have been properly scaled to [0,1].
  with tf.control_dependencies(
      [tf.assert_less_equal(tf.reduce_max(image), 1.),
       tf.assert_greater_equal(tf.reduce_min(image), 0.)]):
    image = tf.subtract(image, 0.5)
    image = tf.multiply(image, 2.0)
    return image
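
A minimal sketch, assuming TensorFlow 1.x and `scale_to_inception_range` as defined above: values in [0, 1] map linearly onto [-1, 1].

import tensorflow as tf

image = tf.constant([0.0, 0.5, 1.0])
scaled = scale_to_inception_range(image)
with tf.Session() as sess:
  print(sess.run(scaled))  # [-1.  0.  1.]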
Example #14
 def _maybe_assert_valid_y(self, y):
   if not self.validate_args:
     return y
   is_positive = tf.assert_non_negative(
       y, message="Inverse transformation input must be greater than or equal to 0.")
   less_than_one = tf.assert_less_equal(
       y,
       tf.constant(1., y.dtype),
       message="Inverse transformation input must be less than or equal to 1.")
   return control_flow_ops.with_dependencies([is_positive, less_than_one], y)
Example #15
 def _maybe_assert_valid(self, x):
   if not self.validate_args:
     return x
   return control_flow_ops.with_dependencies([
       tf.assert_non_negative(x, message="sample must be non-negative"),
       tf.assert_less_equal(
           x,
           tf.ones([], self.concentration0.dtype),
           message="sample must be no larger than `1`."),
   ], x)
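
A standalone sketch (TensorFlow 1.x) of the `with_dependencies` gating used in the two bijector examples above; the checks run only because the returned tensor depends on them.

import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

y = tf.constant([0.0, 0.25, 1.0])
checks = [
    tf.assert_non_negative(y),
    tf.assert_less_equal(y, tf.constant(1., y.dtype)),
]
y = control_flow_ops.with_dependencies(checks, y)
with tf.Session() as sess:
  print(sess.run(y))  # [0.   0.25 1.  ]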
Example #16
 def _maybe_assert_valid_sample(self, counts):
   """Check counts for proper shape, values, then return tensor version."""
   if not self.validate_args:
     return counts
   counts = distribution_util.embed_check_nonnegative_integer_form(counts)
   return control_flow_ops.with_dependencies([
       tf.assert_less_equal(
           counts,
           self.total_count,
           message="counts are not less than or equal to n."),
   ], counts)
Example #17
 def _validate_correlationness(self, x):
   if not self.validate_args:
     return x
   checks = [
       tf.assert_less_equal(
           tf.cast(-1., dtype=x.dtype.base_dtype),
           x,
           message='Correlations must be >= -1.'),
       tf.assert_less_equal(
           x,
           tf.cast(1., x.dtype.base_dtype),
           message='Correlations must be <= 1.'),
       tf.assert_near(
           tf.matrix_diag_part(x),
           tf.cast(1., x.dtype.base_dtype),
           message='Self-correlations must be = 1.'),
       tf.assert_near(
           x, tf.matrix_transpose(x),
           message='Correlation matrices must be symmetric')
   ]
   with tf.control_dependencies(checks):
     return tf.identity(x)
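
The same checks, sketched outside the class for illustration (an assumption, TensorFlow 1.x) on a valid 2x2 correlation matrix:

import tensorflow as tf

x = tf.constant([[1.0, 0.5],
                 [0.5, 1.0]])
checks = [
    tf.assert_less_equal(tf.cast(-1., x.dtype), x,
                         message='Correlations must be >= -1.'),
    tf.assert_less_equal(x, tf.cast(1., x.dtype),
                         message='Correlations must be <= 1.'),
    tf.assert_near(tf.matrix_diag_part(x), tf.cast(1., x.dtype),
                   message='Self-correlations must be = 1.'),
]
with tf.control_dependencies(checks):
  validated = tf.identity(x)
with tf.Session() as sess:
  print(sess.run(validated))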
Example #18
def maybe_split_sequence_lengths(sequence_length, num_splits, total_length):
  """Validates and splits `sequence_length`, if necessary.

  Returned value must be used in graph for all validations to be executed.

  Args:
    sequence_length: A batch of sequence lengths, either sized `[batch_size]`
      and equal to either 0 or `total_length`, or sized
      `[batch_size, num_splits]`.
    num_splits: The scalar number of splits of the full sequences.
    total_length: The scalar total sequence length (potentially padded).

  Returns:
    sequence_length: If input shape was `[batch_size, num_splits]`, returns the
      same Tensor. Otherwise, returns a Tensor of that shape with each input
      length in the batch divided by `num_splits`.
  Raises:
    ValueError: If `sequence_length` is not shaped `[batch_size]` or
      `[batch_size, num_splits]`.
    tf.errors.InvalidArgumentError: If `sequence_length` is shaped
      `[batch_size]` and all values are not either 0 or `total_length`.
  """
  if sequence_length.shape.ndims == 1:
    if total_length % num_splits != 0:
      raise ValueError(
          '`total_length` must be evenly divisible by `num_splits`.')
    with tf.control_dependencies(
        [tf.Assert(
            tf.reduce_all(
                tf.logical_or(tf.equal(sequence_length, 0),
                              tf.equal(sequence_length, total_length))),
            data=[sequence_length])]):
      sequence_length = (
          tf.tile(tf.expand_dims(sequence_length, axis=1), [1, num_splits]) //
          num_splits)
  elif sequence_length.shape.ndims == 2:
    with tf.control_dependencies([
        tf.assert_less_equal(
            sequence_length,
            tf.constant(total_length // num_splits, tf.int32),
            message='Segment length cannot be more than '
                    '`total_length / num_splits`.')]):
      sequence_length = tf.identity(sequence_length)
    sequence_length.set_shape([sequence_length.shape[0], num_splits])
  else:
    raise ValueError(
        'Sequence lengths must be given as a vector or a 2D Tensor whose '
        'second dimension size matches its initial hierarchical split. Got '
        'shape: %s' % sequence_length.shape.as_list())
  return sequence_length
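
A usage sketch (not from the original source), assuming TensorFlow 1.x and the function above in scope: a `[batch_size]` input of full-or-empty lengths is tiled and divided evenly across the splits.

import tensorflow as tf

lengths = tf.constant([8, 0], dtype=tf.int32)  # each entry is 0 or total_length
split = maybe_split_sequence_lengths(lengths, num_splits=4, total_length=8)
with tf.Session() as sess:
  print(sess.run(split))  # [[2 2 2 2], [0 0 0 0]]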
Example #19
def _maximum_mean(samples, envelope, high, name=None):
  """Returns a stochastic upper bound on the mean of a scalar distribution.

  The idea is that if the true CDF is within an `eps`-envelope of the
  empirical CDF of the samples, and the support is bounded above, then
  the mean is bounded above as well.  In symbols,

  ```none
  sup_x(|F_n(x) - F(x)|) < eps
  ```

  The 0th dimension of `samples` is interpreted as independent and
  identically distributed samples.  The remaining dimensions are
  broadcast together with `envelope` and `high`, and operated on
  separately.

  Args:
    samples: Floating-point `Tensor` of samples from the distribution(s)
      of interest.  Entries are assumed IID across the 0th dimension.
      The other dimensions must broadcast with `envelope` and `high`.
    envelope: Floating-point `Tensor` of sizes of admissible CDF
      envelopes (i.e., the `eps` above).
    high: Floating-point `Tensor` of upper bounds on the distributions'
      supports.  `samples <= high`.
    name: A name for this operation (optional).

  Returns:
    bound: Floating-point `Tensor` of upper bounds on the true means.

  Raises:
    InvalidArgumentError: If some `sample` is found to be larger than
      the corresponding `high`.
  """
  with tf.name_scope(name, "maximum_mean", [samples, envelope, high]):
    dtype = dtype_util.common_dtype([samples, envelope, high], tf.float32)
    samples = tf.convert_to_tensor(samples, name="samples", dtype=dtype)
    envelope = tf.convert_to_tensor(envelope, name="envelope", dtype=dtype)
    high = tf.convert_to_tensor(high, name="high", dtype=dtype)

    xmax = tf.reduce_max(samples, axis=[0])
    msg = "Given sample maximum value exceeds expectations"
    check_op = tf.assert_less_equal(xmax, high, message=msg)
    with tf.control_dependencies([check_op]):
      return tf.identity(_do_maximum_mean(samples, envelope, high))
Example #20
  def _init_clusters_random(self):
    """Does random initialization of clusters.

    Returns:
      Tensor of randomly initialized clusters.
    """
    num_data = tf.add_n([tf.shape(inp)[0] for inp in self._inputs])
    # Note that for mini-batch k-means, we should ensure that the batch size of
    # data used during initialization is sufficiently large to avoid duplicated
    # clusters.
    with tf.control_dependencies(
        [tf.assert_less_equal(self._num_clusters, num_data)]):
      indices = tf.random_uniform(tf.reshape(self._num_clusters, [-1]),
                                  minval=0,
                                  maxval=tf.cast(num_data, tf.int64),
                                  seed=self._random_seed,
                                  dtype=tf.int64)
      clusters_init = embedding_lookup(self._inputs, indices,
                                       partition_strategy='div')
      return clusters_init
Example #21
def _init_clusters_random(data, num_clusters, random_seed):
  """Does random initialization of clusters.

  Args:
    data: a list of Tensors with a matrix of data, each row is an example.
    num_clusters: an integer with the number of clusters.
    random_seed: Seed for PRNG used to initialize seeds.

  Returns:
    A Tensor with num_clusters random rows of data.
  """
  assert isinstance(data, list)
  num_data = tf.add_n([tf.shape(inp)[0] for inp in data])
  with tf.control_dependencies([tf.assert_less_equal(num_clusters, num_data)]):
    indices = tf.random_uniform([num_clusters],
                                minval=0,
                                maxval=tf.cast(num_data, tf.int64),
                                seed=random_seed,
                                dtype=tf.int64)
  indices = tf.cast(indices, tf.int32) % num_data
  clusters_init = embedding_lookup(data, indices, partition_strategy='div')
  return clusters_init
Example #22
 def _make_runtime_assertions(
     self, distribution, reinterpreted_batch_ndims, validate_args):
   assertions = []
   static_reinterpreted_batch_ndims = tf.contrib.util.constant_value(
       reinterpreted_batch_ndims)
   batch_ndims = distribution.batch_shape.ndims
   if batch_ndims is not None and static_reinterpreted_batch_ndims is not None:
     if static_reinterpreted_batch_ndims > batch_ndims:
       raise ValueError("reinterpreted_batch_ndims({}) cannot exceed "
                        "distribution.batch_ndims({})".format(
                            static_reinterpreted_batch_ndims, batch_ndims))
   elif validate_args:
     batch_shape = distribution.batch_shape_tensor()
     batch_ndims = (
         batch_shape.shape[0].value
         if batch_shape.shape.with_rank_at_least(1)[0].value is not None else
         tf.shape(batch_shape)[0])
     assertions.append(
         tf.assert_less_equal(
             reinterpreted_batch_ndims,
             batch_ndims,
             message=("reinterpreted_batch_ndims cannot exceed "
                      "distribution.batch_ndims")))
   return assertions
Example #23
  def replace(self, episodes, length, rows=None):
    """Replace full episodes.

    Args:
      episodes: Tuple of transition quantities with batch and time dimensions.
      length: Batch of sequence lengths.
      rows: Episodes to replace, defaults to all.

    Returns:
      Operation.
    """
    rows = tf.range(self._capacity) if rows is None else rows
    assert rows.shape.ndims == 1
    assert_capacity = tf.assert_less(
        rows, self._capacity, message='capacity exceeded')
    with tf.control_dependencies([assert_capacity]):
      assert_max_length = tf.assert_less_equal(
          length, self._max_length, message='max length exceeded')
    with tf.control_dependencies([assert_max_length]):
      replace_ops = tools.nested.map(
          lambda var, val: tf.scatter_update(var, rows, val),
          self._buffers, episodes, flatten=True)
    with tf.control_dependencies(replace_ops):
      return tf.scatter_update(self._length, rows, length)
Example #24
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
        Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. The name of the embedding table variable
        for token type ids.
      use_position_embeddings: bool. Whether to add position embeddings for the
        position of each token in the sequence.
      position_embedding_name: string. The name of the embedding table variable
        for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.
      dropout_prob: float. Dropout probability applied to the final output tensor.

    Returns:
      float tensor with same shape as `input_tensor`.

    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if"
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids,
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we create it
            # using a (long) sequence length `max_position_embeddings`. The actual
            # sequence length might be shorter than this, for faster training of
            # tasks that do not have long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
            # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
            # perform a slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and `width`), so
            # we broadcast among the first dimensions, which is typically just
            # the batch size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
Example #25
  def __init__(self,
               df,
               scale_operator,
               input_output_cholesky=False,
               validate_args=False,
               allow_nan_stats=True,
               name=None):
    """Construct Wishart distributions.

    Args:
      df: `float` or `double` tensor, the degrees of freedom of the
        distribution(s). `df` must be greater than or equal to `k`.
      scale_operator: `float` or `double` instance of `LinearOperator`.
      input_output_cholesky: Python `bool`. If `True`, functions whose input or
        output have the semantics of samples assume inputs are in Cholesky form
        and return outputs in Cholesky form. In particular, if this flag is
        `True`, input to `log_prob` is presumed of Cholesky form and output from
        `sample`, `mean`, and `mode` are of Cholesky form.  Setting this
        argument to `True` is purely a computational optimization and does not
        change the underlying distribution; for instance, `mean` returns the
        Cholesky of the mean, not the mean of Cholesky factors. The `variance`
        and `stddev` methods are unaffected by this flag.
        Default value: `False` (i.e., input/output does not have Cholesky
        semantics).
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
        result is undefined. When `False`, an exception is raised if one or
        more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      TypeError: if scale is not floating-type
      TypeError: if scale.dtype != df.dtype
      ValueError: if df < k, where scale operator event shape is
        `(k, k)`
    """
    parameters = dict(locals())
    self._input_output_cholesky = input_output_cholesky
    with tf.name_scope(name) as name:
      with tf.name_scope("init", values=[df, scale_operator]):
        if not scale_operator.dtype.is_floating:
          raise TypeError(
              "scale_operator.dtype=%s is not a floating-point type" %
              scale_operator.dtype)
        if not scale_operator.is_square:
          raise ValueError("scale_operator must be square.")

        self._scale_operator = scale_operator
        self._df = tf.convert_to_tensor(
            df, dtype=scale_operator.dtype, name="df")
        contrib_tensor_util.assert_same_float_dtype(
            (self._df, self._scale_operator))
        if (self._scale_operator.shape.ndims is None or
            self._scale_operator.shape[-1].value is None):
          self._dimension = tf.cast(
              self._scale_operator.domain_dimension_tensor(),
              dtype=self._scale_operator.dtype,
              name="dimension")
        else:
          self._dimension = tf.convert_to_tensor(
              self._scale_operator.shape[-1].value,
              dtype=self._scale_operator.dtype,
              name="dimension")
        df_val = tensor_util.constant_value(self._df)
        dim_val = tensor_util.constant_value(self._dimension)
        if df_val is not None and dim_val is not None:
          df_val = np.asarray(df_val)
          if not df_val.shape:
            df_val = [df_val]
          if any(df_val < dim_val):
            raise ValueError(
                "Degrees of freedom (df = %s) cannot be less than "
                "dimension of scale matrix (scale.dimension = %s)"
                % (df_val, dim_val))
        elif validate_args:
          assertions = tf.assert_less_equal(
              self._dimension,
              self._df,
              message=("Degrees of freedom (df = %s) cannot be "
                       "less than dimension of scale matrix "
                       "(scale.dimension = %s)" % (self._dimension, self._df)))
          self._df = control_flow_ops.with_dependencies(
              [assertions], self._df)
    super(_WishartLinearOperator, self).__init__(
        dtype=self._scale_operator.dtype,
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats,
        reparameterization_type=tf.distributions.FULLY_REPARAMETERIZED,
        parameters=parameters,
        graph_parents=(
            [self._df, self._dimension] + self._scale_operator.graph_parents),
        name=name)
Example #26
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """对词向量进行后处理

  Args:
    input_tensor: float张量,形状为[batch_size, seq_length,embedding_size],词向量
    use_token_type: bool. 是否添加token的类型向量
    token_type_ids: (可选) int32张量,形状为[batch_size, seq_length],use_token_type为True时必要有
    token_type_vocab_size: int. token类型的数量
    token_type_embedding_name: string. token的类型向量表的名字
    use_position_embeddings: bool. 是否添加位置向量
    position_embedding_name: string. 位置向量表的名字
    initializer_range: float. 初始化的范围参数
    max_position_embeddings: int. 位置向量的最大长度,只能比输入序列更长
    dropout_prob: float. 最后输出的丢弃概率

  Returns:
    跟输入维度一致的float张量

  Raises:
    ValueError: 张量形状或者输入值无效
  """
    # Get the input tensor dimensions: batch_size, seq_length, width (embedding size).
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor  # Initialize the output tensor.

    if use_token_type:
        # Add token-type embeddings.
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        # Token-type embedding table variable.
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # The type vocabulary is always small, so we always use one-hot here;
        # it is faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids,
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings  # Add the token-type embeddings to the output.

    if use_position_embeddings:
        # Add position embeddings.
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            # Position embedding table variable.
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # full_position_embeddings holds vectors for positions 0 through
            # max_position_embeddings-1; a slice yields positions 0 through
            # seq_length-1.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and
            # `width`), so we broadcast over the leading dimensions, which is
            # typically just the batch size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend(
                [seq_length,
                 width])  # position_broadcast_shape=[1, seq_length, width]
            position_embeddings = tf.reshape(
                position_embeddings,
                position_broadcast_shape)  # [1, seq_length, width]
            output += position_embeddings  # Add the position embeddings to the output.

    # Layer-normalize first, then apply dropout.
    output = layer_norm_and_dropout(output, dropout_prob)
    return output
Example #27
def transformer_model(input_tensor,
                      is_training,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=1,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.2,
                      attention_probs_dropout_prob=0.2,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      use_position_embeddings=True,
                      max_position_embeddings=512):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".
    This is almost an exact implementation of the original Transformer encoder.
    See the original paper:
    https://arxiv.org/abs/1706.03762
    Also see:
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py
    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
      is_training: bool. Whether this call builds the training graph; when
        False, both dropout probabilities are forced to 0.
      attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
        seq_length], with 1 for positions that can be attended to and 0 in
        positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_attention_heads: int. Number of attention heads in the Transformer.
      intermediate_size: int. The size of the "intermediate" (a.k.a., feed
        forward) layer.
      intermediate_act_fn: function. The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: float. Dropout probability for the hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the attention
        probabilities.
      initializer_range: float. Range of the initializer (stddev of truncated
        normal).
      do_return_all_layers: Whether to also return all layers or just the final
        layer.
      use_position_embeddings: bool. Whether to add learned position embeddings
        to `input_tensor` before the first layer.
      max_position_embeddings: int. Maximum sequence length supported by the
        position embedding table; must be at least `seq_length`.
    Returns:
      If `do_return_all_layers` is True, a list of float Tensors of shape
      [batch_size, seq_length, hidden_size], one per layer. Otherwise, a float
      Tensor of shape [batch_size, hidden_size] holding the final hidden state
      of the first token.
    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """

    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    if not is_training:
        hidden_dropout_prob = 0.0
        attention_probs_dropout_prob = 0.0

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError(
            "The width of the input tensor (%d) != hidden size (%d)" %
            (input_width, hidden_size))

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name='position_embeddings',
                shape=[max_position_embeddings, input_width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we create it
            # using a (long) sequence length `max_position_embeddings`. The actual
            # sequence length might be shorter than this, for faster training of
            # tasks that do not have long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
            # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
            # perform a slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(input_tensor.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and `width`), so
            # we broadcast among the first dimensions, which is typically just
            # the batch size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, input_width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            input_tensor += position_embeddings

    prev_output = reshape_to_matrix(input_tensor)

    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output

            with tf.variable_scope("attention"):
                attention_heads = []
                with tf.variable_scope("self"):
                    attention_head = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=
                        attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attention_heads.append(attention_head)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    # In the case where we have other sequences, we just concatenate
                    # them to the self-attention head before the projection.
                    attention_output = tf.concat(attention_heads, axis=-1)

                # Run a linear projection of `hidden_size` then add a residual
                # with `layer_input`.
                with tf.variable_scope("output"):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=create_initializer(
                            initializer_range))
                    attention_output = dropout(attention_output,
                                               hidden_dropout_prob)
                    attention_output = layer_norm(attention_output +
                                                  layer_input)

            # The activation is only applied to the "intermediate" hidden layer.
            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range))

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope("output"):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range))
                layer_output = dropout(layer_output, hidden_dropout_prob)
                layer_output = layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        first_token_tensor = tf.squeeze(final_output[:, 0:1, :], axis=1)
        return first_token_tensor
Example #28
def pgd_generate(x, model, eps=0.3, eps_iter=0.05, nb_iter=10, y=None,
                 ord=np.inf, clip_min=None, clip_max=None, y_target=None,
                 rand_init=True, rand_init_eps=0.3, clip_grad=False,
                 sanity_checks=True):
    """
    Generate symbolic graph for adversarial examples and return.
    :param x: The model's symbolic inputs.
    :param kwargs: See `parse_params`
    """  
    
    asserts = []

    # If a data range was specified, check that the input was in that range
    if clip_min is not None:
      asserts.append(tf.assert_greater_equal(x,
                                             tf.cast(clip_min,
                                                           x.dtype)))
    if clip_max is not None:
      asserts.append(tf.assert_less_equal(x,
                                          tf.cast(clip_max,
                                                        x.dtype)))
    # Initialize loop variables
    if rand_init:
      eta = random_lp_vector(tf.shape(x), ord,
                             tf.cast(rand_init_eps, x.dtype),
                             dtype=x.dtype)
    else:
      eta = tf.zeros(tf.shape(x))

    # Clip eta
    eta = clip_eta(eta, ord, eps)
    adv_x = x + eta
    if clip_min is not None or clip_max is not None:
      adv_x = clip_by_value(adv_x, clip_min, clip_max)

    if y_target is not None:
      y = y_target
      targeted = True
    elif y is not None:
      y = y
      targeted = False
    else:
      model_preds = model(x)
      preds_max = tf.reduce_max(model_preds, 1, keepdims=True)
      y = tf.to_float(tf.equal(model_preds, preds_max))
      y = tf.stop_gradient(y)
      targeted = False
      del model_preds

#    def cond(i, _):
#      """Iterate until requested number of iterations is completed"""
#      return tf.less(i, nb_iter)
#
#    def body(i, adv_x):
#      """Do a projected gradient step"""
#      adv_x = fgsm_generate(adv_x, model, y=y, eps=eps,  ord=ord, clip_min=clip_min, clip_max=clip_max, 
#                            clip_grad=clip_grad, targeted=targeted, sanity_checks=True)
#     
#      # Clipping perturbation eta to ord norm ball
#      eta = adv_x - x
#      eta = clip_eta(eta, ord, eps)
#      adv_x = x + eta
#
#      # Redo the clipping.
#      # FGM already did it, but subtracting and re-adding eta can add some
#      # small numerical error.
#      if clip_min is not None or clip_max is not None:
#        adv_x = utils_tf.clip_by_value(adv_x, clip_min, clip_max)
#
#      return i + 1, adv_x
#
#    _, adv_x = tf.while_loop(cond, body, (tf.zeros([]), adv_x), back_prop=True,
#                             maximum_iterations=nb_iter)

    for i in range(nb_iter):

        adv_x = fgsm_generate(adv_x, model, y=y, eps=eps_iter,  ord=ord, clip_min=clip_min, clip_max=clip_max, 
                              clip_grad=clip_grad, targeted=targeted, sanity_checks=True)
        # Clip the perturbation eta to the ord-norm ball.
        eta = adv_x - x
        eta = clip_eta(eta, ord, eps)
        adv_x = x + eta
        # Redo the clipping.
        # FGM already did it, but subtracting and re-adding eta can add some
        # small numerical error.
        if clip_min is not None or clip_max is not None:
            adv_x = clip_by_value(adv_x, clip_min, clip_max)
        
    common_dtype = tf.float32
    asserts.append(tf.assert_less_equal(tf.cast(eps_iter, dtype=common_dtype), tf.cast(eps, dtype=common_dtype)))
    if ord == np.inf and clip_min is not None and clip_max is not None:
      asserts.append(tf.assert_less_equal(tf.cast(eps, x.dtype),
                                          1e-6 + tf.cast(clip_max, x.dtype)
                                          - tf.cast(clip_min, x.dtype)))
    if sanity_checks:
      with tf.control_dependencies(asserts):
        adv_x = tf.identity(adv_x)

    return adv_x
Example #29
    def model_fn(features, labels, mode, params):
        is_training = mode == tf.estimator.ModeKeys.TRAIN
        # Inputs
        tokens = features[TEXT]  # (N, L)
        token_lengths = features[SENTENCE_LENGTH]  # (N,)
        sequence_mask = tf.sequence_mask(maxlen=tf.shape(tokens)[1],
                                         lengths=token_lengths)
        n = tf.shape(tokens)[0]
        length = params.flat_length

        with tf.control_dependencies([
                tf.assert_greater_equal(
                    length,
                    token_lengths,
                    message="Tokens longer than flat_length"),
                tf.assert_less_equal(tokens,
                                     tf.cast(vocab_size - 1,
                                             dtype=tokens.dtype),
                                     message="Tokens larger than vocab"),
                tf.assert_greater_equal(tokens,
                                        tf.cast(0, dtype=tokens.dtype),
                                        message="Tokens less than 0")
        ]):
            tokens = tf.identity(tokens)

        if params.l2 > 0:
            weights_regularizer = slim.l2_regularizer(params.l2)
        else:
            weights_regularizer = None

        with tf.variable_scope('autoencoder') as autoencoder_scope:
            # Encoder
            with tf.variable_scope('encoder'):
                mu, logsigma = encoder_flat(
                    tokens=tokens,
                    token_lengths=token_lengths,
                    vocab_size=vocab_size,
                    params=params,
                    n=n,
                    weights_regularizer=weights_regularizer)
            # Sampling
            latent_sample, latent_prior_sample = sampling_flat(
                mu=mu, logsigma=logsigma, params=params, n=n)

            # Decoder
            with tf.variable_scope('decoder', reuse=False) as decoder_scope:
                logits = decoder_flat(latent=latent_sample,
                                      vocab_size=vocab_size,
                                      params=params,
                                      weights_regularizer=weights_regularizer,
                                      n=n)
            if params.model_mode == ModelModes.AE:
                glogits = None
            else:
                with tf.variable_scope(decoder_scope, reuse=True):
                    glogits = decoder_flat(
                        latent=latent_prior_sample,
                        vocab_size=vocab_size,
                        params=params,
                        weights_regularizer=weights_regularizer,
                        n=n)

        if params.model_mode == ModelModes.AAE_RE or params.model_mode == ModelModes.AAE_STOCH:
            with tf.variable_scope('discriminator') as discriminator_scope:
                dis_inputs = tf.concat([latent_prior_sample, latent_sample],
                                       axis=0)
                dis_out = discriminator_output(
                    x=dis_inputs,
                    params=params,
                    weights_regularizer=weights_regularizer,
                    is_training=is_training)
                dis_out = tf.squeeze(dis_out, -1)
                print("Dis: {} -> {}".format(dis_inputs, dis_out))
            build_gan_losses(params=params,
                             autoencoder_scope=autoencoder_scope.name,
                             discriminator_scope=discriminator_scope.name,
                             dis_out=dis_out,
                             n=n)
            discriminator_hook = dis_train_hook(
                discriminator_scope=discriminator_scope.name, params=params)
            training_hooks = [discriminator_hook]
        elif params.model_mode == ModelModes.VAE:
            training_hooks = []
        elif params.model_mode == ModelModes.AE:
            training_hooks = []
        else:
            raise ValueError()
        sequence_length_ctc = tf.tile([length], (n, ))

        return ctc_estimator(tokens=tokens,
                             token_lengths=token_lengths,
                             logits=logits,
                             glogits=glogits,
                             sequence_mask=sequence_mask,
                             sequence_length_ctc=sequence_length_ctc,
                             vocab=vocab,
                             run_config=run_config,
                             params=params,
                             model_scope=autoencoder_scope.name,
                             training_hooks=training_hooks,
                             mode=mode)
Example #30
 def one_hots(offsets, name='one_hots'):
     with tf.name_scope(name) as scope:
         with tf.control_dependencies([tf.assert_less_equal(tf.abs(offsets), scale)]):
             result = tf.expand_dims(tf.one_hot(scale - offsets, kernel_size), 1, name=scope)
             assert_shape(result, [batch_size, 1, kernel_size])
             return result
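
A standalone sketch of the same pattern (TensorFlow 1.x), with the closure variables `scale`, `kernel_size`, and `batch_size` made explicit; these values are assumptions about the enclosing scope, not part of the original source.

import tensorflow as tf

scale, kernel_size, batch_size = 2, 5, 3
offsets = tf.constant([-2, 0, 2])  # one offset per batch element
with tf.control_dependencies([tf.assert_less_equal(tf.abs(offsets), scale)]):
  result = tf.expand_dims(tf.one_hot(scale - offsets, kernel_size), 1)
with tf.Session() as sess:
  print(sess.run(result).shape)  # (3, 1, 5), i.e. [batch_size, 1, kernel_size]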
Example #31
def bert_encoder(sequence, params):

    # extract sequence mask information
    seq_mask = 1. - tf.to_float(tf.equal(sequence, params.bert.vocab.pad))

    # extract segment information
    seg_pos = tf.to_float(tf.equal(sequence, params.bert.vocab.sep))
    seg_ids = tf.cumsum(seg_pos, axis=1, reverse=True)
    seg_num = tf.reduce_sum(seg_pos, axis=1, keepdims=True)
    seg_ids = seg_num - seg_ids
    seg_ids = tf.to_int32(seg_ids * seq_mask)

    # sequence length information
    seq_shp = util.shape_list(sequence)
    batch_size, seq_length = seq_shp[:2]

    def custom_getter(getter, name, *args, **kwargs):
        kwargs['trainable'] = params.tune_bert
        return getter(name, *args, **kwargs)

    with tf.variable_scope("bert", custom_getter=custom_getter):

        # handling sequence embeddings: token embedding plus segment embedding plus positional embedding
        embed_initializer = tf.truncated_normal_initializer(stddev=params.bert.initializer_range)
        with tf.variable_scope("embeddings"):
            word_embedding = tf.get_variable(
                name="word_embeddings",
                shape=[params.bert.vocab.size, params.bert.hidden_size],
                initializer=embed_initializer
            )
            seq_embed = tf.nn.embedding_lookup(word_embedding, sequence)

            segment_embedding = tf.get_variable(
                name="token_type_embeddings",
                shape=[2, params.bert.hidden_size],
                initializer=embed_initializer
            )
            seg_embed = tf.nn.embedding_lookup(segment_embedding, seg_ids)

            # word embedding + segment embedding
            seq_embed = seq_embed + seg_embed

            # add position embedding
            assert_op = tf.assert_less_equal(seq_length, params.bert.max_position_embeddings)
            with tf.control_dependencies([assert_op]):
                position_embedding = tf.get_variable(
                    name="position_embeddings",
                    shape=[params.bert.max_position_embeddings, params.bert.hidden_size],
                    initializer=embed_initializer
                )
                pos_embed = position_embedding[:seq_length]

                seq_embed = seq_embed + tf.expand_dims(pos_embed, 0)

            # post-processing, layer norm and segmentation
            seq_embed = tc.layers.layer_norm(
                inputs=seq_embed, begin_norm_axis=-1, begin_params_axis=-1)

            seq_embed = util.valid_apply_dropout(seq_embed, params.bert.hidden_dropout_prob)

        bert_outputs = []

        #  handling sequence encoding with transformer encoder
        with tf.variable_scope("encoder"):
            attention_mask = encoder.create_attention_mask_from_input_mask(
                sequence, seq_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            all_encoder_layers = encoder.transformer_model(
                input_tensor=seq_embed,
                attention_mask=attention_mask,
                hidden_size=params.bert.hidden_size,
                num_hidden_layers=params.bert.num_hidden_layers,
                num_attention_heads=params.bert.num_attention_heads,
                intermediate_size=params.bert.intermediate_size,
                intermediate_act_fn=encoder.get_activation(params.bert.hidden_act),
                hidden_dropout_prob=params.bert.hidden_dropout_prob,
                attention_probs_dropout_prob=params.bert.attention_probs_dropout_prob,
                initializer_range=params.bert.initializer_range,
                do_return_all_layers=True)

        sequence_output = all_encoder_layers

        bert_outputs.append(sequence_output)

        if params.use_bert_single:
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(sequence_output[-1][:, 0:1, :], axis=1)
                pooled_output = tf.layers.dense(
                    first_token_tensor,
                    params.bert.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=embed_initializer)

                bert_outputs.append(pooled_output)

        return bert_outputs
Example #32
    def __init__(self,
                 mean_direction,
                 concentration,
                 validate_args=False,
                 allow_nan_stats=True,
                 name='VonMisesFisher'):
        """Creates a new `VonMisesFisher` instance.

    Args:
      mean_direction: Floating-point `Tensor` with shape [B1, ... Bn, D].
        A unit vector indicating the mode of the distribution, or the
        unit-normalized direction of the mean. (This is *not* in general the
        mean of the distribution; the mean is not generally in the support of
        the distribution.) NOTE: `D` is currently restricted to <= 5.
      concentration: Floating-point `Tensor` having batch shape [B1, ... Bn]
        broadcastable with `mean_direction`. The level of concentration of
        samples around the `mean_direction`. `concentration=0` indicates a
        uniform distribution over the unit hypersphere, and `concentration=+inf`
        indicates a `Deterministic` distribution (delta function) at
        `mean_direction`.
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`,
        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
        indicate the result is undefined. When `False`, an exception is raised
        if one or more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      ValueError: For known-bad arguments, i.e. unsupported event dimension.
    """
        parameters = dict(locals())
        with tf.name_scope(name, values=[mean_direction,
                                         concentration]) as name:
            assertions = [
                tf.assert_non_negative(
                    concentration,
                    message='`concentration` must be non-negative'),
                tf.assert_greater(
                    tf.shape(mean_direction)[-1],
                    1,
                    message='`mean_direction` may not have scalar event shape'
                ),
                tf.assert_near(1.,
                               tf.linalg.norm(mean_direction, axis=-1),
                               message='`mean_direction` must be unit-length')
            ] if validate_args else []
            if mean_direction.shape.with_rank_at_least(
                    1)[-1].value is not None:
                if mean_direction.shape.with_rank_at_least(1)[-1].value > 5:
                    raise ValueError(
                        'vMF ndims > 5 is not currently supported')
            elif validate_args:
                assertions += [
                    tf.assert_less_equal(
                        tf.shape(mean_direction)[-1],
                        5,
                        message='vMF ndims > 5 is not currently supported')
                ]
            with tf.control_dependencies(assertions):
                self._mean_direction = tf.convert_to_tensor(
                    mean_direction, name='mean_direction')
                self._concentration = tf.convert_to_tensor(
                    concentration, name='concentration')
                tf.assert_same_float_dtype(
                    [self._mean_direction, self._concentration])
            # mean_direction is always reparameterized.
            # concentration is only for event_dim==3, via an inversion sampler.
            reparameterization_type = (
                tf.distributions.FULLY_REPARAMETERIZED
                if mean_direction.shape.with_rank_at_least(1)[-1].value == 3
                else tf.distributions.NOT_REPARAMETERIZED)
            super(VonMisesFisher, self).__init__(
                dtype=self._concentration.dtype,
                validate_args=validate_args,
                allow_nan_stats=allow_nan_stats,
                reparameterization_type=reparameterization_type,
                parameters=parameters,
                graph_parents=[self._mean_direction, self._concentration],
                name=name)
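# A minimal usage sketch, assuming a TF1-era tensorflow_probability build that
# ships a matching `VonMisesFisher` (the package path and API are assumptions,
# not part of the snippet above):
import tensorflow as tf
import tensorflow_probability as tfp  # assumed import path

# D == 3 here, so sampling is fully reparameterized (see the constructor logic).
vmf = tfp.distributions.VonMisesFisher(
    mean_direction=tf.nn.l2_normalize(tf.constant([0., 1., 0.]), axis=-1),
    concentration=tf.constant(2.),
    validate_args=True)
samples = vmf.sample(5)  # shape [5, 3]; rows are unit vectors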
Beispiel #33
0
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
    # At this point input_tensor is 3-D: [batch_size, seq_length, embedding_size]
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    # If token types are used, first create the token-type embedding table of shape [token_type_vocab_size, width]
    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if"
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary.
        # Token-type embeddings are looked up via a one-hot matmul. The token type
        # marks which segment a token belongs to: single-sentence classification
        # uses type 0, sentence-pair tasks use two types, and the default
        # vocabulary allows at most 16 types.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids,
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        #将token embedding的结果加在原来的embedding结果中
        output += token_type_embeddings

    if use_position_embeddings:
        # Make sure the sequence length does not exceed the maximum position count
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        # tf.control_dependencies makes the ops created inside the block run only after the assertion passes
        with tf.control_dependencies([assert_op]):
            # Create the position-embedding table of shape [max_position_embeddings, width]
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we create it
            # using a (long) sequence length `max_position_embeddings`. The actual
            # sequence length might be shorter than this, for faster training of
            # tasks that do not have long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
            # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
            # perform a slice.
            # Although we created an embedding table covering every position up to
            # the maximum, the actual input never exceeds the maximum length BERT
            # allows. For efficiency we slice out only the first seq_length rows,
            # giving a position embedding of shape [seq_length, width].
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and `width`), so
            # we broadcast among the first dimensions, which is typically just
            # the batch size.
            # In the input, the last two dimensions are seq_length and width
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                # every leading dimension except the last two broadcasts,
                # so append a 1 for each of them
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])  # [1, seq_length, width]
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            # Every [seq_length, width] slice in the batch gets the same position embedding added
            output += position_embeddings

    # Apply layer norm and dropout
    output = layer_norm_and_dropout(output, dropout_prob)
    return output
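# A minimal usage sketch, assuming `create_initializer`, `get_shape_list`, and
# `layer_norm_and_dropout` are importable from the surrounding BERT-style
# modeling module; shapes are illustrative only.
word_embeddings = tf.random_normal([8, 128, 768])   # [batch, seq, width]
segment_ids = tf.zeros([8, 128], dtype=tf.int32)    # single-segment input
embedded = embedding_postprocessor(
    input_tensor=word_embeddings,
    use_token_type=True,
    token_type_ids=segment_ids,
    token_type_vocab_size=2,
    use_position_embeddings=True,
    max_position_embeddings=512,
    dropout_prob=0.1)                               # -> [8, 128, 768]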
Beispiel #34
0
def pix2pix_preprocess(images,
                       num_preprocessing_layers=0,
                       num_outputs=3,
                       encoder_base_num_filters=32,
                       is_training=True,
                       reuse=False,
                       is_chief=True,
                       num_rows=5,
                       batch_size=32,
                       verbose=True,
                       **kwargs):
    """Free-form transformation preprocessing.
    
    Args:
        images: 4D tensor of images.
        num_preprocessing_layers: If negative, its absolute value is the number
            of layers in the pix2pix encoder; 0 disables preprocessing.
        num_outputs: Number of output channels for Pix2Pix.
        encoder_base_num_filters: Base number of filters in the pix2pix encoder.
        num_rows: required for summaries (if is_chief is True).
        batch_size: required for summaries (if is_chief is True).
        is_training: whether the model is in training mode.
        reuse: whether to reuse the model variables.
        is_chief: whether to add summaries.
        verbose: verbosity level.
        kwargs: Unused keyword arguments.
    """
    assert num_preprocessing_layers <= 0

    # No preprocessing
    if num_preprocessing_layers == 0:
        return images
    # Pix2pix
    else:
        # Pix2Pix takes images in [-1, 1]
        with tf.control_dependencies([tf.assert_greater_equal(images, 0.)]):
            with tf.control_dependencies([tf.assert_less_equal(images, 1.)]):
                images = (images - 0.5) * 2
            if is_chief:
                input_images = viz_utils.image_grid(images,
                                                    num_rows=num_rows,
                                                    batch_size=batch_size)

        # Pix2pix. Output in [-1, 1]
        encoder_blocks = [
            encoder_base_num_filters * (2**i)
            for i in range(-num_preprocessing_layers)
        ]
        images = net_utils.pix2pix(images,
                                   encoder_blocks=encoder_blocks,
                                   num_outputs=num_outputs,
                                   is_training=is_training,
                                   reuse=reuse,
                                   is_chief=is_chief,
                                   verbose=verbose,
                                   **kwargs)
        # Image summaries
        if is_chief:
            images = tf.identity(images, name='projected_images')
            mode = ('train' if is_training else 'test')
            output_images = viz_utils.image_grid(images,
                                                 num_rows=num_rows,
                                                 batch_size=batch_size)
            # Tile if necessary
            if num_outputs == 1 and input_images.get_shape()[-1] == 3:
                output_images = tf.tile(output_images, (1, 1, 1, 3))
            if input_images.get_shape()[-1] == 1 and num_outputs == 3:
                input_images = tf.tile(input_images, (1, 1, 1, 3))
            # Grid
            summary_images = viz_utils.image_grid(tf.concat(
                [input_images, output_images], axis=0),
                                                  num_rows=1,
                                                  num_cols=2,
                                                  batch_size=2)
            summary_images = tf.identity(summary_images, name='in_out_images')
            tf.summary.image('%s/in_out' % mode,
                             summary_images,
                             collections=[mode])

        # Send output back to [0, 1]
        images = images / 2. + 0.5
        return images
Beispiel #35
0
    def __init__(self,
                 learning_rate,
                 preconditioner_decay_rate=0.95,
                 num_pseudo_batches=1,
                 burnin=25,
                 diagonal_bias=1e-8,
                 name=None,
                 variable_scope=None):
        default_name = 'StochasticGradientLangevinDynamics'
        with tf.name_scope(name, default_name, [
                learning_rate, preconditioner_decay_rate, num_pseudo_batches,
                burnin, diagonal_bias
        ]):
            if variable_scope is None:
                var_scope_name = tf.get_default_graph().unique_name(
                    name or default_name)
                with tf.variable_scope(var_scope_name) as scope:
                    self._variable_scope = scope
            else:
                self._variable_scope = variable_scope

            self._preconditioner_decay_rate = tf.convert_to_tensor(
                preconditioner_decay_rate, name='preconditioner_decay_rate')
            self._num_pseudo_batches = tf.convert_to_tensor(
                num_pseudo_batches, name='num_pseudo_batches')
            self._burnin = tf.convert_to_tensor(burnin, name='burnin')
            self._diagonal_bias = tf.convert_to_tensor(diagonal_bias,
                                                       name='diagonal_bias')
            self._learning_rate = tf.convert_to_tensor(learning_rate,
                                                       name='learning_rate')

            with tf.variable_scope(self._variable_scope):
                self._counter = tf.get_variable('counter',
                                                initializer=0,
                                                trainable=False)

            self._preconditioner_decay_rate = control_flow_ops.with_dependencies([
                tf.assert_non_negative(
                    self._preconditioner_decay_rate,
                    message='`preconditioner_decay_rate` must be non-negative'
                ),
                tf.assert_less_equal(
                    self._preconditioner_decay_rate,
                    1.,
                    message='`preconditioner_decay_rate` must be at most 1.'),
            ], self._preconditioner_decay_rate)

            self._num_pseudo_batches = control_flow_ops.with_dependencies([
                tf.assert_greater(
                    self._num_pseudo_batches,
                    0,
                    message='`num_pseudo_batches` must be greater than zero')
            ], self._num_pseudo_batches)

            self._burnin = control_flow_ops.with_dependencies([
                tf.assert_non_negative(
                    self._burnin, message='`burnin` must be non-negative'),
                tf.assert_integer(self._burnin,
                                  message='`burnin` must be an integer')
            ], self._burnin)

            self._diagonal_bias = control_flow_ops.with_dependencies([
                tf.assert_non_negative(
                    self._diagonal_bias,
                    message='`diagonal_bias` must be non-negative')
            ], self._diagonal_bias)

            super(StochasticGradientLangevinDynamics,
                  self).__init__(use_locking=False, name=name or default_name)
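# The validation idiom above, isolated as a sketch: with_dependencies returns
# its last argument with the assert ops attached as control dependencies, so
# any downstream use of the tensor runs the checks first. (Standalone example,
# not part of the optimizer.)
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

decay = tf.convert_to_tensor(0.95, name='decay')
decay = control_flow_ops.with_dependencies([
    tf.assert_non_negative(decay, message='`decay` must be non-negative'),
    tf.assert_less_equal(decay, 1., message='`decay` must be at most 1.'),
], decay)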
Beispiel #36
0
    def generate(self, x, **kwargs):
        """
        Generate symbolic graph for adversarial examples and return.

        :param x: The model's symbolic inputs.
        :param eps: (optional float) maximum distortion of adversarial example
                    compared to original input
        :param eps_iter: (optional float) step size for each attack iteration
        :param nb_iter: (optional int) Number of attack iterations.
        :param rand_init: (optional) Whether to use random initialization
        :param y: (optional) A tensor with the true class labels
            NOTE: do not use smoothed labels here
        :param y_target: (optional) A tensor with the labels to target. Leave
                            y_target=None if y is also set. Labels should be
                            one-hot-encoded.
            NOTE: do not use smoothed labels here
        :param ord: (optional) Order of the norm (mimics Numpy).
                    Possible values: np.inf, 1 or 2.
        :param clip_min: (optional float) Minimum input component value
        :param clip_max: (optional float) Maximum input component value
        """
        # Parse and save attack-specific parameters
        assert self.parse_params(**kwargs)

        # Initialize loop variables
        if self.rand_init:
            eta = tf.random_uniform(tf.shape(x),
                                    -self.rand_minmax,
                                    self.rand_minmax,
                                    dtype=self.tf_dtype)
        else:
            eta = tf.zeros(tf.shape(x))
        eta = clip_eta(eta, self.ord, self.eps)

        # Fix labels to the first model predictions for loss computation
        model_preds = self.model.get_output(x)
        preds_max = reduce_max(model_preds, 1, keepdims=True)
        if self.y_target is not None:
            y = self.y_target
            targeted = True
        elif self.y is not None:
            y = self.y
            targeted = False
        else:
            y = tf.to_float(tf.equal(model_preds, preds_max))
            y = tf.stop_gradient(y)
            targeted = False

        y_kwarg = 'y_target' if targeted else 'y'
        fgm_params = {
            'eps': self.eps_iter,
            y_kwarg: y,
            'ord': self.ord,
            'clip_min': self.clip_min,
            'clip_max': self.clip_max
        }

        # Use getattr() to avoid errors in eager execution attacks
        FGM = self.FGM_CLASS(self.model,
                             sess=getattr(self, 'sess', None),
                             dtypestr=self.dtypestr)

        def cond(i, _):
            return tf.less(i, self.nb_iter)

        def body(i, e):
            adv_x = FGM.generate(x + e, **fgm_params)

            # Clipping perturbation according to clip_min and clip_max
            if self.clip_min is not None and self.clip_max is not None:
                adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

            # Clipping perturbation eta to self.ord norm ball
            eta = adv_x - x
            eta = clip_eta(eta, self.ord, self.eps)
            return i + 1, eta

        _, eta = tf.while_loop(cond, body, [tf.zeros([]), eta], back_prop=True)

        # Define adversarial example (and clip if necessary)
        adv_x = x + eta
        if self.clip_min is not None or self.clip_max is not None:
            assert self.clip_min is not None and self.clip_max is not None
            adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

        asserts = []

        # Asserts run only on CPU.
        # When multi-GPU eval code tries to force all PGD ops onto GPU, this
        # can cause an error.
        with tf.device("/CPU:0"):
            asserts.append(tf.assert_less_equal(self.eps_iter, self.eps))
            if self.ord == np.inf and self.clip_min is not None:
                # The 1e-6 is needed to compensate for numerical error.
                # Without the 1e-6 this fails when e.g. eps=.2, clip_min=.5, clip_max=.7
                asserts.append(
                    tf.assert_less_equal(self.eps,
                                         1e-6 + self.clip_max - self.clip_min))

        if self.sanity_checks:
            with tf.control_dependencies(asserts):
                adv_x = tf.identity(adv_x)

        return adv_x
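# Stripped of the class machinery, the loop above is iterated FGSM plus a
# projection step. A self-contained L-infinity sketch (the `model_fn` callable
# and the [0, 1] input range are assumptions):
import tensorflow as tf

def pgd_linf(model_fn, x, y, eps=0.3, eps_iter=0.05, nb_iter=10):
    # random start inside the eps-ball, as with rand_init above
    eta = tf.random_uniform(tf.shape(x), -eps, eps)

    def cond(i, _):
        return tf.less(i, nb_iter)

    def body(i, eta):
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=y, logits=model_fn(x + eta))
        grad = tf.gradients(loss, eta)[0]
        eta = eta + eps_iter * tf.sign(grad)    # one FGSM step
        eta = tf.clip_by_value(eta, -eps, eps)  # project back onto the eps-ball
        return i + 1, eta

    _, eta = tf.while_loop(cond, body, [tf.constant(0), eta])
    return tf.clip_by_value(x + eta, 0., 1.)    # keep inputs in valid range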
Beispiel #37
0
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=3,
                            token_type_embedding_name='token_type_embeddings',
                            use_positional_embeddings=True,
                            positional_embedding_type='normal',
                            pre_positional_embeddings=None,
                            positional_embedding_name='position_embeddings',
                            initializer_range=0.01,
                            max_positional_embeddings=512,
                            dropout_prob=0.01):
    """Performs some preprocessing on the word embeddings.
    
    Args:
        input_tensor: float Tensor of shape [batch_size, seq_length, embedding_size].
        use_token_type: bool. Whether to add segment (token-type) embeddings.
            For example, token_type_ids of [[0, 0, 1], [0, 1, 0]] assign 0 to
            tokens of the first segment and 1 to tokens of the second; a
            trailing 0 may also correspond to padding.
        token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
        token_type_vocab_size: the number of token types.
        use_positional_embeddings: bool. Whether to add positional embeddings.
        positional_embedding_type: ['normal', 'trigonometrical'].
        pre_positional_embeddings: (optional) pre-computed positional embeddings.
        positional_embedding_name: string. The name of the embedding table variable.
        initializer_range: float. Range of the weight initializer.
        max_positional_embeddings: int. Maximum sequence length, which must be at
            least as long as the actual sequence length.
        dropout_prob: float. Dropout probability applied to the final output tensor.
    
    Returns:
        float Tensor with the same shape as `input_tensor`.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size, seq_length, width = input_shape[0], input_shape[1], input_shape[2]

    # create this variable in case of not use any pre-embeddings on the input_tensor
    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            _error('`token_type_ids` must be specified if `use_token_type` is True.')
            raise ValueError
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        
        token_type_embeddings = tf.nn.embedding_lookup(token_type_table, token_type_ids)
        output += token_type_embeddings

    if use_positional_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_positional_embeddings)
        with tf.control_dependencies([assert_op]):
            full_positional_embeddings = tf.get_variable(
                name=positional_embedding_name,
                shape=[max_positional_embeddings, width],
                initializer=create_initializer(initializer_range))
            
            # the full_positional_embeddings is created under the maximum sequence length,
            # however, the actual length maybe less than the maximum length, so slicing is necessary.
            positional_embeddings = tf.slice(full_positional_embeddings, [0, 0], [seq_length, -1])
            output += positional_embeddings
    
    output = layer_norm_and_dropout(output, dropout_prob)
    return output
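# Note that tf.nn.embedding_lookup above replaces the one-hot matmul used by
# the other variants in this collection; the two are equivalent row gathers.
# A toy check (shapes hypothetical):
ids = tf.constant([[0, 1], [1, 0]])                 # [batch=2, seq=2]
table = tf.get_variable('toy_table', shape=[2, 4])  # [vocab, width]
via_lookup = tf.nn.embedding_lookup(table, ids)     # [2, 2, 4]
via_onehot = tf.reshape(
    tf.matmul(tf.one_hot(tf.reshape(ids, [-1]), depth=2), table), [2, 2, 4])
# via_lookup equals via_onehot; one-hot matmul is faster for tiny vocabularies.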
Beispiel #38
0
def embedding_postprocessor(
    input_tensor,
    use_token_type=False,
    token_type_ids=None,
    token_type_vocab_size=2,
    token_type_embedding_name='token_type_embeddings',
    use_position_embeddings=True,
    position_embedding_name='position_embeddings',
    initializer_range=0.02,
    max_position_embeddings=512,
):
    """Performs various post-processing on a word embedding tensor.
  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
  Returns:
    float tensor with same shape as `input_tensor`.
  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError(
                '`token_type_ids` must be specified if '
                '`use_token_type` is True.'
            )
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range),
        )
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(
            flat_token_type_ids, depth=token_type_vocab_size
        )
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(
            token_type_embeddings, [batch_size, seq_length, width]
        )
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range),
            )
            position_embeddings = tf.slice(
                full_position_embeddings, [0, 0], [seq_length, -1]
            )
            num_dims = len(output.shape.as_list())
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])
            position_embeddings = tf.reshape(
                position_embeddings, position_broadcast_shape
            )
            output += position_embeddings

    return output
Beispiel #39
0
    def _sample_n(self, n, seed=None):
        seed = seed_stream.SeedStream(seed, salt='von_mises_fisher')
        # The sampling strategy relies on the fact that vMF variates are symmetric
        # about the mean direction. Accordingly, if we have a sampling strategy for
        # the away-from-mean angle, then we can uniformly sample the remaining
        # dimensions on the S^{dim-2} sphere for free, and rotate these samples from a
        # (1, 0, 0, ..., 0)-mode distribution into the target orientation.
        #
        # This is easy to imagine on the 1-sphere (S^1; in 2-D space): sample a
        # von-Mises distributed `x` value in [-1, 1], then uniformly select what
        # amounts to an "up" or "down" additional degree of freedom after unit
        # normalizing, followed by a final rotation to the desired mean direction
        # from a basis of (1, 0).
        #
        # On S^2 (in 3-D), selecting a vMF `x` identifies a circle in `yz` on the
        # unit sphere over which the distribution is uniform, in particular the
        # circle where x = \hat{x} intersects the unit sphere. We pick a point on
        # that circle, then rotate to the desired mean direction from a basis of
        # (1, 0, 0).
        event_dim = self.event_shape[0].value or self._event_shape_tensor()[0]

        sample_batch_shape = tf.concat([[n], self._batch_shape_tensor()],
                                       axis=0)
        dim = tf.cast(event_dim - 1, self.dtype)
        if event_dim == 3:
            samples_dim0 = self._sample_3d(n, seed=seed)
        else:
            # Wood'94 provides a rejection algorithm to sample the x coordinate.
            # Wood'94 definition of b:
            # b = (-2 * kappa + tf.sqrt(4 * kappa**2 + dim**2)) / dim
            # https://stats.stackexchange.com/questions/156729 suggests:
            b = dim / (2 * self.concentration +
                       tf.sqrt(4 * self.concentration**2 + dim**2))
            # TODO(bjp): Integrate any useful numerical tricks from hyperspherical VAE
            #     https://github.com/nicola-decao/s-vae-tf/
            x = (1 - b) / (1 + b)
            c = self.concentration * x + dim * tf.log1p(-x**2)
            beta = tf.distributions.Beta(dim / 2, dim / 2)

            def cond_fn(w, should_continue):
                del w
                return tf.reduce_any(should_continue)

            def body_fn(w, should_continue):
                z = beta.sample(sample_shape=sample_batch_shape, seed=seed())
                w = tf.where(should_continue,
                             (1 - (1 + b) * z) / (1 - (1 - b) * z), w)
                w = tf.check_numerics(w, 'w')
                should_continue = tf.logical_and(
                    should_continue,
                    self.concentration * w + dim * tf.log1p(-x * w) - c <
                    tf.log(
                        tf.random_uniform(sample_batch_shape,
                                          seed=seed(),
                                          dtype=self.dtype)))
                return w, should_continue

            w = tf.zeros(sample_batch_shape, dtype=self.dtype)
            should_continue = tf.ones(sample_batch_shape, dtype=tf.bool)
            samples_dim0 = tf.while_loop(cond_fn, body_fn,
                                         (w, should_continue))[0]
            samples_dim0 = samples_dim0[..., tf.newaxis]
        if not self._allow_nan_stats:
            # Verify samples are w/in -1, 1, with useful error output tensors (top
            # value rather than all values).
            with tf.control_dependencies([
                    tf.assert_less_equal(
                        samples_dim0,
                        self.dtype.as_numpy_dtype(1.01),
                        data=[tf.nn.top_k(tf.reshape(samples_dim0, [-1]))[0]]),
                    tf.assert_greater_equal(
                        samples_dim0,
                        self.dtype.as_numpy_dtype(-1.01),
                        data=[
                            -tf.nn.top_k(tf.reshape(-samples_dim0, [-1]))[0]
                        ])
            ]):
                samples_dim0 = tf.identity(samples_dim0)
        samples_otherdims_shape = tf.concat(
            [sample_batch_shape, [event_dim - 1]], axis=0)
        unit_otherdims = tf.nn.l2_normalize(tf.random_normal(
            samples_otherdims_shape, seed=seed(), dtype=self.dtype),
                                            axis=-1)
        samples = tf.concat(
            [
                samples_dim0,  # we must avoid sqrt(1 - (>1)**2)
                tf.sqrt(tf.maximum(1 - samples_dim0**2, 0.)) * unit_otherdims
            ],
            axis=-1)
        samples = tf.nn.l2_normalize(samples, axis=-1)
        if not self._allow_nan_stats:
            samples = tf.check_numerics(samples, 'samples')

        # Runtime assert that samples are unit length.
        if not self._allow_nan_stats:
            worst, idx = tf.nn.top_k(
                tf.reshape(tf.abs(1 - tf.linalg.norm(samples, axis=-1)), [-1]))
            with tf.control_dependencies([
                    tf.assert_near(self.dtype.as_numpy_dtype(0),
                                   worst,
                                   data=[
                                       worst, idx,
                                       tf.gather(
                                           tf.reshape(samples,
                                                      [-1, event_dim]), idx)
                                   ],
                                   atol=1e-4,
                                   summarize=100)
            ]):
                samples = tf.identity(samples)
        # The samples generated are symmetric around a mode at (1, 0, 0, ...., 0).
        # Now, we move the mode to `self.mean_direction` using a rotation matrix.
        if not self._allow_nan_stats:
            # Assert that the basis vector rotates to the mean direction, as expected.
            basis = tf.cast(
                tf.concat([[1.], tf.zeros([event_dim - 1])], axis=0),
                self.dtype)
            with tf.control_dependencies([
                    tf.assert_less(
                        tf.linalg.norm(self._rotate(basis) -
                                       self.mean_direction,
                                       axis=-1),
                        self.dtype.as_numpy_dtype(1e-5))
            ]):
                return self._rotate(samples)
        return self._rotate(samples)
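# The rejection step above follows Wood '94. A toy NumPy transcription for a
# scalar concentration may make the acceptance test easier to read; here `dim`
# is event_dim - 1, matching the code, and all names are local to the sketch.
import numpy as np

def sample_vmf_w(kappa, event_dim, n, rng=np.random):
    dim = event_dim - 1.
    b = dim / (2 * kappa + np.sqrt(4 * kappa**2 + dim**2))
    x = (1 - b) / (1 + b)
    c = kappa * x + dim * np.log1p(-x**2)
    out = []
    while len(out) < n:
        z = rng.beta(dim / 2, dim / 2)
        w = (1 - (1 + b) * z) / (1 - (1 - b) * z)
        # accept when the log-density ratio clears a uniform threshold
        if kappa * w + dim * np.log1p(-x * w) - c >= np.log(rng.uniform()):
            out.append(w)
    return np.array(out)  # first coordinates of samples about (1, 0, ..., 0)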
Beispiel #40
0
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    # Performs various post-processing on a word embedding tensor.

    # Args:
    #   input_tensor: float Tensor;
    #   use_token_type: bool, whether to add embeddings for `token_type_ids`;
    #   token_type_ids: (optional) int32 Tensor;
    #   token_type_vocab_size: int, vocabulary size of `token_type_ids`;
    #   token_type_embedding_name: string, name of the embedding table for token type ids;
    #   use_position_embeddings: bool, whether to add an embedding for each token's position in the sequence;
    #   position_embedding_name: string, name of the position-embedding table;
    #   initializer_range: float, range of the weight initialization;
    #   max_position_embeddings: int, maximum sequence length; may be longer than input_tensor but not shorter;
    #   dropout_prob: float, dropout probability for the final output

    # Returns:
    #   a Tensor with the same shape as `input_tensor`;
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("如果`use_token_type`是True, `token_type_ids`必须被赋值.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab is small, so we always use one-hot here; it is faster for small vocabularies
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids,
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we create
            # it with the (long) maximum sequence length "max_position_embeddings".
            # The actual sequence length may be shorter, which speeds up training
            # on tasks whose sequences are not long.

            # So 'full_position_embeddings' is effectively an embedding table for
            # positions [0, 1, ..., max_position_embeddings-1], while the current
            # sequence covers [0, 1, ..., seq_length-1], so we can simply slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions (`seq_length` and `width`) are relevant,
            # so we broadcast over the leading dimensions, typically just batch_size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)

    return output

Beispiel #41
0
def percentile(x,
               q,
               axis=None,
               interpolation=None,
               keep_dims=False,
               validate_args=False,
               name=None):
    """Compute the `q`-th percentile(s) of `x`.

  Given a vector `x`, the `q`-th percentile of `x` is the value `q / 100` of the
  way from the minimum to the maximum in a sorted copy of `x`.

  The values and distances of the two nearest neighbors as well as the
  `interpolation` parameter will determine the percentile if the normalized
  ranking does not match the location of `q` exactly.

  This function is the same as the median if `q = 50`, the same as the minimum
  if `q = 0` and the same as the maximum if `q = 100`.

  Multiple percentiles can be computed at once by using `1-D` vector `q`.
  Dimension zero of the returned `Tensor` will index the different percentiles.


  ```python
  # Get 30th percentile with default ('nearest') interpolation.
  x = [1., 2., 3., 4.]
  percentile(x, q=30.)
  ==> 2.0

  # Get 30th and 70th percentiles with 'lower' interpolation
  x = [1., 2., 3., 4.]
  percentile(x, q=[30., 70.], interpolation='lower')
  ==> [1., 3.]

  # Get 100th percentile (maximum).  By default, this is computed over every dim
  x = [[1., 2.]
       [3., 4.]]
  percentile(x, q=100.)
  ==> 4.

  # Treat the leading dim as indexing samples, and find the 100th quantile (max)
  # over all such samples.
  x = [[1., 2.]
       [3., 4.]]
  percentile(x, q=100., axis=[0])
  ==> [3., 4.]
  ```

  Compare to `numpy.percentile`.

  Args:
    x:  Floating point `N-D` `Tensor` with `N > 0`.  If `axis` is not `None`,
      `x` must have statically known number of dimensions.
    q:  Scalar or vector `Tensor` with values in `[0, 100]`. The percentile(s).
    axis:  Optional `0-D` or `1-D` integer `Tensor` with constant values. The
      axis that hold independent samples over which to return the desired
      percentile.  If `None` (the default), treat every dimension as a sample
      dimension, returning a scalar.
    interpolation : {'lower', 'higher', 'nearest'}.  Default: 'nearest'. This
      optional parameter specifies the interpolation method to
      use when the desired quantile lies between two data points `i < j`:
        * lower: `i`.
        * higher: `j`.
        * nearest: `i` or `j`, whichever is nearest.
    keep_dims:  Python `bool`. If `True`, the last dimension is kept with size 1.
      If `False`, the last dimension is removed from the output shape.
    validate_args:  Whether to add runtime checks of argument validity. If
      False, and arguments are incorrect, correct behavior is not guaranteed.
    name:  A Python string name to give this `Op`.  Default is 'percentile'

  Returns:
    A `(rank(q) + N - len(axis))` dimensional `Tensor` of same dtype as `x`, or,
      if `axis` is `None`, a `rank(q)` `Tensor`.  The first `rank(q)` dimensions
      index quantiles for different values of `q`.

  Raises:
    ValueError:  If argument 'interpolation' is not an allowed type.
  """
    name = name or 'percentile'
    allowed_interpolations = {'lower', 'higher', 'nearest'}

    if interpolation is None:
        interpolation = 'nearest'
    else:
        if interpolation not in allowed_interpolations:
            raise ValueError(
                'Argument `interpolation` must be in %s.  Found %s' %
                (allowed_interpolations, interpolation))

    with tf.name_scope(name, values=[x, q]):
        x = tf.convert_to_tensor(x, name='x')
        # Double is needed here and below, else we get the wrong index if the array
        # is huge along axis.
        q = tf.to_double(q, name='q')
        _get_static_ndims(q, expect_ndims_no_more_than=1)

        if validate_args:
            q = control_flow_ops.with_dependencies([
                tf.assert_rank_in(q, [0, 1]),
                tf.assert_greater_equal(q, tf.to_double(0.)),
                tf.assert_less_equal(q, tf.to_double(100.))
            ], q)

        if axis is None:
            y = tf.reshape(x, [-1])
        else:
            axis = tf.convert_to_tensor(axis, name='axis')
            tf.assert_integer(axis)
            axis_ndims = _get_static_ndims(axis,
                                           expect_static=True,
                                           expect_ndims_no_more_than=1)
            axis_const = tensor_util.constant_value(axis)
            if axis_const is None:
                raise ValueError(
                    'Expected argument `axis` to be statically available.  Found: %s'
                    % axis)
            axis = axis_const
            if axis_ndims == 0:
                axis = [axis]
            axis = [int(a) for a in axis]
            x_ndims = _get_static_ndims(x,
                                        expect_static=True,
                                        expect_ndims_at_least=1)
            axis = _make_static_axis_non_negative(axis, x_ndims)
            # Move dims in axis to the end, since _sort_tensor, which calls top_k,
            # only sorts the last dim.
            y = _move_dims_to_flat_end(x, axis, x_ndims)

        frac_at_q_or_above = 1. - q / 100.
        d = tf.to_double(tf.shape(y)[-1])

        if interpolation == 'lower':
            indices = tf.ceil((d - 1) * frac_at_q_or_above)
        elif interpolation == 'higher':
            indices = tf.floor((d - 1) * frac_at_q_or_above)
        elif interpolation == 'nearest':
            indices = tf.round((d - 1) * frac_at_q_or_above)

        # If d is gigantic, then we would have d == d - 1, even in double... So
        # let's use max/min to avoid out of bounds errors.
        d = tf.shape(y)[-1]
        # d - 1 will be distinct from d in int32.
        indices = tf.clip_by_value(tf.to_int32(indices), 0, d - 1)

        # Sort everything, not just the top 'k' entries, which allows multiple calls
        # to sort only once (under the hood) and use CSE.
        sorted_y = _sort_tensor(y)

        # Gather the indices along the sorted (last) dimension.
        # If q is a vector, the last dim of gathered_y indexes different q[i].
        gathered_y = tf.gather(sorted_y, indices, axis=-1)

        if keep_dims:
            if axis is None:
                ones_vec = tf.ones(shape=[
                    _get_best_effort_ndims(x) + _get_best_effort_ndims(q)
                ],
                                   dtype=tf.int32)
                gathered_y *= tf.ones(ones_vec, dtype=x.dtype)
            else:
                gathered_y = _insert_back_keep_dims(gathered_y, axis)

        # If q is a scalar, then result has the right shape.
        # If q is a vector, then result has trailing dim of shape q.shape, which
        # needs to be rotated to dim 0.
        return util.rotate_transpose(gathered_y, tf.rank(q))
Beispiel #42
0
def check_range(tensor, low, high, message_prefix=''):
    low = tf.assert_greater_equal(tensor, low, message=message_prefix + '>=')
    high = tf.assert_less_equal(tensor, high, message=message_prefix + '<=')
    with tf.control_dependencies([low, high]):
        return tf.identity(tensor)
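# A usage sketch (the placeholder shape is illustrative): the returned identity
# carries both range asserts as control dependencies, so evaluating anything
# downstream of `checked` validates the input at runtime.
images = tf.placeholder(tf.float32, [None, 28, 28, 1])
checked = check_range(images, 0., 1., message_prefix='images ')
# feeding values outside [0, 1] raises an InvalidArgumentError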
Beispiel #43
0
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
  """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if"
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocab will be small so we always do one-hot here, since it is always
    # faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if use_position_embeddings:
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # Since the position embedding table is a learned variable, we create it
      # using a (long) sequence length `max_position_embeddings`. The actual
      # sequence length might be shorter than this, for faster training of
      # tasks that do not have long sequences.
      #
      # So `full_position_embeddings` is effectively an embedding table
      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
      # perform a slice.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`), so
      # we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
Beispiel #44
0
def tower(inputs,
          is_training,
          dropout_probability,
          input_noise,
          normalize_input,
          flip_horizontally,
          translate,
          num_logits,
          is_initialization=False,
          name=None):
    with tf.name_scope(name, "tower"):
        default_conv_args = dict(padding='SAME',
                                 kernel_size=[3, 3],
                                 activation_fn=nn.lrelu,
                                 init=is_initialization)
        training_mode_funcs = [
            nn.random_translate, nn.flip_randomly, nn.gaussian_noise,
            slim.dropout, wn.fully_connected, wn.conv2d
        ]
        training_args = dict(is_training=is_training)

        with \
        slim.arg_scope([wn.conv2d], **default_conv_args), \
        slim.arg_scope(training_mode_funcs, **training_args):
            #pylint: disable=no-value-for-parameter
            net = inputs
            assert_shape(net, [None, 32, 32, 3])

            net = tf.cond(
                normalize_input, lambda: slim.layer_norm(
                    net, scale=False, center=False, scope='normalize_inputs'),
                lambda: net)
            assert_shape(net, [None, 32, 32, 3])

            net = nn.flip_randomly(net,
                                   horizontally=flip_horizontally,
                                   vertically=False,
                                   name='random_flip')
            net = tf.cond(
                translate, lambda: nn.random_translate(
                    net, scale=2, name='random_translate'), lambda: net)
            net = nn.gaussian_noise(net,
                                    scale=input_noise,
                                    name='gaussian_noise')

            net = wn.conv2d(net, 128, scope="conv_1_1")
            net = wn.conv2d(net, 128, scope="conv_1_2")
            net = wn.conv2d(net, 128, scope="conv_1_3")
            net = slim.max_pool2d(net, [2, 2], scope='max_pool_1')
            net = slim.dropout(net,
                               1 - dropout_probability,
                               scope='dropout_probability_1')
            assert_shape(net, [None, 16, 16, 128])

            net = wn.conv2d(net, 256, scope="conv_2_1")
            net = wn.conv2d(net, 256, scope="conv_2_2")
            net = wn.conv2d(net, 256, scope="conv_2_3")
            net = slim.max_pool2d(net, [2, 2], scope='max_pool_2')
            net = slim.dropout(net,
                               1 - dropout_probability,
                               scope='dropout_probability_2')
            assert_shape(net, [None, 8, 8, 256])

            net = wn.conv2d(net, 512, padding='VALID', scope="conv_3_1")
            assert_shape(net, [None, 6, 6, 512])
            net = wn.conv2d(net, 256, kernel_size=[1, 1], scope="conv_3_2")
            net = wn.conv2d(net, 128, kernel_size=[1, 1], scope="conv_3_3")
            net = slim.avg_pool2d(net, [6, 6], scope='avg_pool')
            assert_shape(net, [None, 1, 1, 128])

            net = slim.flatten(net)
            assert_shape(net, [None, 128])

            primary_logits = wn.fully_connected(net,
                                                10,
                                                init=is_initialization)
            secondary_logits = wn.fully_connected(net,
                                                  10,
                                                  init=is_initialization)

            with tf.control_dependencies([
                    tf.assert_greater_equal(num_logits, 1),
                    tf.assert_less_equal(num_logits, 2)
            ]):
                secondary_logits = tf.case([
                    (tf.equal(num_logits, 1), lambda: primary_logits),
                    (tf.equal(num_logits, 2), lambda: secondary_logits),
                ],
                                           exclusive=True,
                                           default=lambda: primary_logits)

            assert_shape(primary_logits, [None, 10])
            assert_shape(secondary_logits, [None, 10])
            return primary_logits, secondary_logits
Beispiel #45
0
def embedding_postprocessor(input_tensor,
                            use_position_embeddings=True,
                            position1_ids=None,
                            position2_ids=None,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_position_embeddings: bool. Whether to add position embeddings for the
        position of each token in the sequence.
      position_embedding_name: string. The name of the embedding table variable
        for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.
      dropout_prob: float. Dropout probability applied to the final output tensor.

    Returns:
      float tensor with same shape as `input_tensor`.

    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    embedding_size = input_shape[2]

    output = input_tensor

    if use_position_embeddings:

        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings * 2, embedding_size],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we create it
            # using a (long) sequence length `max_position_embeddings`. The actual
            # sequence length might be shorter than this, for faster training of
            # tasks that do not have long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
            # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
            # perform a slice.
            # position_embeddings = tf.slice(full_position_embeddings, [0, 0],
            #                                [seq_length, -1])
            if position1_ids is None or position2_ids is None:
                raise ValueError('You must provide the position information.')
            flat_position1_ids = tf.reshape(position1_ids, shape=[-1])
            flat_position2_ids = tf.reshape(position2_ids, shape=[-1])
            position1_embeddings = tf.nn.embedding_lookup(
                full_position_embeddings, flat_position1_ids)
            position2_embeddings = tf.nn.embedding_lookup(
                full_position_embeddings, flat_position2_ids)
            position_embeddings = position1_embeddings + position2_embeddings
            position_embeddings = tf.reshape(
                position_embeddings,
                shape=[batch_size, seq_length, embedding_size])

            output += position_embeddings
    output = layer_norm_and_dropout(output, dropout_prob)
    return output
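A hypothetical invocation of the variant above (shapes illustrative; `get_shape_list`, `create_initializer`, and `layer_norm_and_dropout` are assumed to be in scope, as in the snippet itself):

batch_size, seq_length, hidden = 8, 128, 768
word_embeddings = tf.random_normal([batch_size, seq_length, hidden])
# Two position id streams; both index the shared table of
# max_position_embeddings * 2 rows.
position1_ids = tf.tile(tf.range(seq_length)[None, :], [batch_size, 1])
position2_ids = tf.zeros([batch_size, seq_length], dtype=tf.int32)

output = embedding_postprocessor(
    word_embeddings,
    use_position_embeddings=True,
    position1_ids=position1_ids,
    position2_ids=position2_ids,
    max_position_embeddings=512,
    dropout_prob=0.1)  # -> [8, 128, 768]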
Beispiel #46
0
def preprocess_data(sequence_id, sequence, audio, velocity_range, hparams,
                    is_training):
    """Compute spectral representation, labels, and length from sequence/audio.

  Args:
    sequence_id: id of the sequence.
    sequence: String tensor containing serialized NoteSequence proto.
    audio: String tensor containing WAV data.
    velocity_range: String tensor containing max and min velocities of file as a
      serialized VelocityRange.
    hparams: HParams object specifying hyperparameters.
    is_training: Whether or not this is a training run.

  Returns:
    An InputTensors tuple.

  Raises:
    ValueError: If hparams contains an invalid spec_type.
  """

    wav_jitter_amount_ms = label_jitter_amount_ms = 0
    # if there is combined jitter, we must generate it once here
    if is_training and hparams.jitter_amount_ms > 0:
        wav_jitter_amount_ms = np.random.choice(hparams.jitter_amount_ms,
                                                size=1)
        label_jitter_amount_ms = wav_jitter_amount_ms

    if label_jitter_amount_ms > 0:
        sequence = jitter_label_op(sequence, label_jitter_amount_ms / 1000.)

    # possibly shift the entire sequence backward, for better forward-only training
    if hparams.backward_shift_amount_ms > 0:
        sequence = jitter_label_op(sequence,
                                   hparams.backward_shift_amount_ms / 1000.)

    if is_training:
        audio = transform_wav_data_op(audio,
                                      hparams=hparams,
                                      jitter_amount_sec=wav_jitter_amount_ms /
                                      1000.)

    spec = wav_to_spec_op(audio, hparams=hparams)

    labels, label_weights, onsets, offsets, velocities = sequence_to_pianoroll_op(
        sequence, velocity_range, hparams=hparams)

    length = wav_to_num_frames_op(audio, hparams_frames_per_second(hparams))

    asserts = []
    if hparams.max_expected_train_example_len and is_training:
        asserts.append(
            tf.assert_less_equal(length,
                                 hparams.max_expected_train_example_len))

    with tf.control_dependencies(asserts):
        return InputTensors(spec=spec,
                            labels=labels,
                            label_weights=label_weights,
                            length=length,
                            onsets=onsets,
                            offsets=offsets,
                            velocities=velocities,
                            sequence_id=sequence_id,
                            note_sequence=sequence)
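The length check above follows the usual TF 1.x idiom: an assert op only executes if something depends on it, hence the `tf.control_dependencies` wrapper. A minimal standalone sketch:

import tensorflow as tf

length = tf.placeholder(tf.int32, shape=[])
check = tf.assert_less_equal(length, 1000, message='example too long')
with tf.control_dependencies([check]):
    gated_length = tf.identity(length)  # evaluating this also runs the check

with tf.Session() as sess:
    sess.run(gated_length, feed_dict={length: 640})     # passes
    # sess.run(gated_length, feed_dict={length: 2000})  # InvalidArgumentError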
Beispiel #47
0
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=2,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
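    """Adds token-type and position embeddings to a word embedding tensor,
    then applies layer normalization and dropout (this mirrors the BERT
    reference `embedding_postprocessor`)."""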

    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor
    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if"
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is always
        # faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(
            flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we create it
            # using a (long) sequence length `max_position_embeddings`. The actual
            # sequence length might be shorter than this, for faster training of
            # tasks that do not have long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
            # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
            # perform a slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and `width`), so
            # we broadcast among the first dimensions, which is typically just
            # the batch size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, width])
            # position_embeddings : [1,seq_length,width]
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            # output : [batch_size, seq_length, width], broadcast position_embeddings
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
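The broadcast trick in the position-embedding branch is worth seeing in isolation; a sketch assuming TF 1.x:

import tensorflow as tf

seq_length, width = 128, 768
full_position_embeddings = tf.get_variable('pos_table', shape=[512, width])
position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                               [seq_length, -1])          # [seq_length, width]
position_embeddings = tf.reshape(position_embeddings,
                                 [1, seq_length, width])  # prepend batch dim
# Adding broadcasts the same position embeddings across the whole batch.
output = tf.zeros([8, seq_length, width]) + position_embeddings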
Beispiel #48
0
def fgsm_generate(x, model, y=None, eps=0.3, ord=np.inf, clip_min=None,
                  clip_max=None, clip_grad=False, targeted=False,
                  sanity_checks=True):
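  """Generates adversarial examples with the fast gradient (sign) method.

  Takes the logits of `model(x)`, builds a cross-entropy loss against `y`
  (or against the model's own argmax predictions when `y` is None, to avoid
  label leaking), differentiates the loss w.r.t. `x`, and applies an
  `eps`-sized step under the given `ord` norm via `optimize_linear`.
  Optional clipping keeps `adv_x` within [clip_min, clip_max].
  """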

  asserts = []

  # If a data range was specified, check that the input was in that range
  if clip_min is not None:
    asserts.append(tf.assert_greater_equal(x, tf.cast(clip_min, x.dtype)))

  if clip_max is not None:
    asserts.append(tf.assert_less_equal(x, tf.cast(clip_max, x.dtype)))

  logits = model(x)._op.inputs[0]

  if y is None:
    # Using model predictions as ground truth to avoid label leaking
    preds_max = reduce_max(logits, 1, keepdims=True)
    y = tf.to_float(tf.equal(logits, preds_max))
    y = tf.stop_gradient(y)
  y = y / reduce_sum(y, 1, keepdims=True)

  # Compute loss. The cross-entropy (CE) loss is active; CW and DLR variants
  # are kept below, commented out, as drop-in alternatives.
  loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
  if targeted:
    loss = -loss

  # CW-loss:
  # logits_sort = tf.contrib.framework.sort(logits, axis=1, direction="DESCENDING")
  # logits_max = tf.gather(logits_sort, axis=1, indices=[0])
  # logits_secondmax = tf.gather(logits_sort, axis=1, indices=[1])
  # loss = -tf.reduce_mean(logits_max - logits_secondmax)
  # if targeted:
  #     loss = -loss

  # DLR-loss (the denominator's stabilizer was written as 1e12 in the original;
  # presumably 1e-12 was intended):
  # logits_sort = tf.contrib.framework.sort(logits, axis=1, direction="DESCENDING")
  # logits_max = tf.gather(logits_sort, axis=1, indices=[0])
  # logits_secondmax = tf.gather(logits_sort, axis=1, indices=[1])
  # logits_thirdmax = tf.gather(logits_sort, axis=1, indices=[2])
  # loss = -tf.reduce_mean(tf.divide(logits_max - logits_secondmax,
  #                                  logits_max - logits_thirdmax + 1e-12))
  # if targeted:
  #     loss = -loss

  # Define gradient of loss wrt input
  grad, = tf.gradients(loss, x)

  if clip_grad:
    grad = zero_out_clipped_grads(grad, x, clip_min, clip_max)

  optimal_perturbation = optimize_linear(grad, eps, ord)

  # Add perturbation to original example to obtain adversarial example
  adv_x = x + optimal_perturbation

  # If clipping is needed, reset all values outside of [clip_min, clip_max]
  if (clip_min is not None) or (clip_max is not None):
    # We don't currently support one-sided clipping
    assert clip_min is not None and clip_max is not None
    adv_x = clip_by_value(adv_x, clip_min, clip_max)

  if sanity_checks:
    with tf.control_dependencies(asserts):
      adv_x = tf.identity(adv_x)

  return adv_x
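A hypothetical call, assuming `model` is a callable that maps MNIST-like inputs in [0, 1] to class probabilities:

x = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])
adv_x = fgsm_generate(x, model, eps=0.3, ord=np.inf,
                      clip_min=0., clip_max=1.)
# adv_x moves each pixel by at most eps (L-inf) and stays inside [0, 1].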
Beispiel #49
0
  def __init__(self,
               mean_direction,
               concentration,
               validate_args=False,
               allow_nan_stats=True,
               name='VonMisesFisher'):
    """Creates a new `VonMisesFisher` instance.

    Args:
      mean_direction: Floating-point `Tensor` with shape [B1, ... Bn, D].
        A unit vector indicating the mode of the distribution, or the
        unit-normalized direction of the mean. (This is *not* in general the
        mean of the distribution; the mean is not generally in the support of
        the distribution.) NOTE: `D` is currently restricted to <= 5.
      concentration: Floating-point `Tensor` having batch shape [B1, ... Bn]
        broadcastable with `mean_direction`. The level of concentration of
        samples around the `mean_direction`. `concentration=0` indicates a
        uniform distribution over the unit hypersphere, and `concentration=+inf`
        indicates a `Deterministic` distribution (delta function) at
        `mean_direction`.
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`,
        statistics (e.g., mean, mode, variance) use the value "`NaN`" to
        indicate the result is undefined. When `False`, an exception is raised
        if one or more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      ValueError: For known-bad arguments, i.e. unsupported event dimension.
    """
    parameters = dict(locals())
    with tf.name_scope(name, values=[mean_direction, concentration]) as name:
      dtype = dtype_util.common_dtype([mean_direction, concentration],
                                      tf.float32)
      mean_direction = tf.convert_to_tensor(
          mean_direction, name='mean_direction', dtype=dtype)
      concentration = tf.convert_to_tensor(
          concentration, name='concentration', dtype=dtype)
      assertions = [
          tf.assert_non_negative(
              concentration, message='`concentration` must be non-negative'),
          tf.assert_greater(
              tf.shape(mean_direction)[-1], 1,
              message='`mean_direction` may not have scalar event shape'),
          tf.assert_near(
              1., tf.linalg.norm(mean_direction, axis=-1),
              message='`mean_direction` must be unit-length')
      ] if validate_args else []
      if mean_direction.shape.with_rank_at_least(1)[-1].value is not None:
        if mean_direction.shape.with_rank_at_least(1)[-1].value > 5:
          raise ValueError('vMF ndims > 5 is not currently supported')
      elif validate_args:
        assertions += [tf.assert_less_equal(
            tf.shape(mean_direction)[-1], 5,
            message='vMF ndims > 5 is not currently supported')]
      with tf.control_dependencies(assertions):
        self._mean_direction = tf.identity(mean_direction)
        self._concentration = tf.identity(concentration)
      tf.assert_same_float_dtype([self._mean_direction, self._concentration])
      # mean_direction is always reparameterized.
      # concentration is only for event_dim==3, via an inversion sampler.
      reparameterization_type = (
          reparameterization.FULLY_REPARAMETERIZED
          if mean_direction.shape.with_rank_at_least(1)[-1].value == 3 else
          reparameterization.NOT_REPARAMETERIZED)
      super(VonMisesFisher, self).__init__(
          dtype=self._concentration.dtype,
          validate_args=validate_args,
          allow_nan_stats=allow_nan_stats,
          reparameterization_type=reparameterization_type,
          parameters=parameters,
          graph_parents=[self._mean_direction, self._concentration],
          name=name)
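Hypothetical construction of the distribution above (event dimension 3, so sampling is fully reparameterized per the comment in __init__):

vmf = VonMisesFisher(mean_direction=[0., 0., 1.],
                     concentration=10.,
                     validate_args=True)
samples = vmf.sample(5)  # shape [5, 3]; unit-norm rows concentrated near +z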
Beispiel #50
0
  def _sample_n(self, n, seed=None):
    seed = seed_stream.SeedStream(seed, salt='von_mises_fisher')
    # The sampling strategy relies on the fact that vMF variates are symmetric
    # about the mean direction. Accordingly, if we have a sampling strategy for
    # the away-from-mean angle, then we can uniformly sample the remaining
    # dimensions on the S^{dim-2} sphere, and rotate these samples from a
    # (1, 0, 0, ..., 0)-mode distribution into the target orientation.
    #
    # This is easy to imagine on the 1-sphere (S^1; in 2-D space): sample a
    # von-Mises distributed `x` value in [-1, 1], then uniformly select what
    # amounts to a "up" or "down" additional degree of freedom after unit
    # normalizing, followed by a final rotation to the desired mean direction
    # from a basis of (1, 0).
    #
    # On S^2 (in 3-D), selecting a vMF `x` identifies a circle in `yz` on the
    # unit sphere over which the distribution is uniform, in particular the
    # circle where x = \hat{x} intersects the unit sphere. We pick a point on
    # that circle, then rotate to the desired mean direction from a basis of
    # (1, 0, 0).
    event_dim = self.event_shape[0].value or self._event_shape_tensor()[0]

    sample_batch_shape = tf.concat([[n], self._batch_shape_tensor()], axis=0)
    dim = tf.cast(event_dim - 1, self.dtype)
    if event_dim == 3:
      samples_dim0 = self._sample_3d(n, seed=seed)
    else:
      # Wood'94 provides a rejection algorithm to sample the x coordinate.
      # Wood'94 definition of b:
      # b = (-2 * kappa + tf.sqrt(4 * kappa**2 + dim**2)) / dim
      # https://stats.stackexchange.com/questions/156729 suggests:
      b = dim / (2 * self.concentration +
                 tf.sqrt(4 * self.concentration**2 + dim**2))
      # TODO(bjp): Integrate any useful numerical tricks from hyperspherical VAE
      #     https://github.com/nicola-decao/s-vae-tf/
      x = (1 - b) / (1 + b)
      c = self.concentration * x + dim * tf.log1p(-x**2)
      beta = beta_lib.Beta(dim / 2, dim / 2)

      def cond_fn(w, should_continue):
        del w
        return tf.reduce_any(should_continue)

      def body_fn(w, should_continue):
        z = beta.sample(sample_shape=sample_batch_shape, seed=seed())
        w = tf.where(should_continue, (1 - (1 + b) * z) / (1 - (1 - b) * z), w)
        w = tf.check_numerics(w, 'w')
        should_continue = tf.logical_and(
            should_continue,
            self.concentration * w + dim * tf.log1p(-x * w) - c <
            tf.log(tf.random_uniform(sample_batch_shape, seed=seed(),
                                     dtype=self.dtype)))
        return w, should_continue

      w = tf.zeros(sample_batch_shape, dtype=self.dtype)
      should_continue = tf.ones(sample_batch_shape, dtype=tf.bool)
      samples_dim0 = tf.while_loop(cond_fn, body_fn, (w, should_continue))[0]
      samples_dim0 = samples_dim0[..., tf.newaxis]
    if not self._allow_nan_stats:
      # Verify samples are w/in -1, 1, with useful error output tensors (top
      # value rather than all values).
      with tf.control_dependencies([
          tf.assert_less_equal(
              samples_dim0, self.dtype.as_numpy_dtype(1.01),
              data=[tf.nn.top_k(tf.reshape(samples_dim0, [-1]))[0]]),
          tf.assert_greater_equal(
              samples_dim0, self.dtype.as_numpy_dtype(-1.01),
              data=[-tf.nn.top_k(tf.reshape(-samples_dim0, [-1]))[0]])]):
        samples_dim0 = tf.identity(samples_dim0)
    samples_otherdims_shape = tf.concat([sample_batch_shape, [event_dim - 1]],
                                        axis=0)
    unit_otherdims = tf.nn.l2_normalize(
        tf.random_normal(samples_otherdims_shape, seed=seed(),
                         dtype=self.dtype),
        axis=-1)
    samples = tf.concat([
        samples_dim0,  # we must avoid sqrt(1 - (>1)**2)
        tf.sqrt(tf.maximum(1 - samples_dim0**2, 0.)) * unit_otherdims
    ], axis=-1)
    samples = tf.nn.l2_normalize(samples, axis=-1)
    if not self._allow_nan_stats:
      samples = tf.check_numerics(samples, 'samples')

    # Runtime assert that samples are unit length.
    if not self._allow_nan_stats:
      worst, idx = tf.nn.top_k(
          tf.reshape(tf.abs(1 - tf.linalg.norm(samples, axis=-1)), [-1]))
      with tf.control_dependencies([
          tf.assert_near(
              self.dtype.as_numpy_dtype(0), worst,
              data=[worst, idx,
                    tf.gather(tf.reshape(samples, [-1, event_dim]), idx)],
              atol=1e-4, summarize=100)]):
        samples = tf.identity(samples)
    # The samples generated are symmetric around a mode at (1, 0, 0, ...., 0).
    # Now, we move the mode to `self.mean_direction` using a rotation matrix.
    if not self._allow_nan_stats:
      # Assert that the basis vector rotates to the mean direction, as expected.
      basis = tf.cast(tf.concat([[1.], tf.zeros([event_dim - 1])], axis=0),
                      self.dtype)
      with tf.control_dependencies([
          tf.assert_less(
              tf.linalg.norm(self._rotate(basis) - self.mean_direction,
                             axis=-1),
              self.dtype.as_numpy_dtype(1e-5))
      ]):
        return self._rotate(samples)
    return self._rotate(samples)
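For intuition, the Wood (1994) rejection loop in `body_fn` can be written as a small NumPy sketch (illustrative only; it mirrors `b`, `x`, `c`, and the acceptance test above for a single scalar draw):

import numpy as np

def sample_vmf_first_coord(kappa, event_dim, rng=np.random):
    dim = event_dim - 1.
    b = dim / (2 * kappa + np.sqrt(4 * kappa**2 + dim**2))
    x = (1 - b) / (1 + b)
    c = kappa * x + dim * np.log1p(-x**2)
    while True:
        z = rng.beta(dim / 2., dim / 2.)
        w = (1 - (1 + b) * z) / (1 - (1 - b) * z)
        # Accept exactly when body_fn's `should_continue` would turn False.
        if kappa * w + dim * np.log1p(-x * w) - c >= np.log(rng.uniform()):
            return w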
Beispiel #51
0
    def __init__(self,
                 batch_size,
                 total_num_examples,
                 max_learning_rate=1.,
                 preconditioner_decay_rate=0.95,
                 burnin=25,
                 burnin_max_learning_rate=1e-6,
                 use_single_learning_rate=False,
                 name=None,
                 variable_scope=None):
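        """Initializes the VariationalSGD optimizer, attaching runtime checks
        that the decay rate is in [0, 1], the batch size and example count are
        positive, burnin is a non-negative integer, and the learning rates are
        non-negative."""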
        default_name = 'VariationalSGD'
        with tf.name_scope(name, default_name, [
                max_learning_rate, preconditioner_decay_rate, batch_size,
                burnin, burnin_max_learning_rate
        ]):
            if variable_scope is None:
                var_scope_name = tf.get_default_graph().unique_name(
                    name or default_name)
                with tf.variable_scope(var_scope_name) as scope:
                    self._variable_scope = scope
            else:
                self._variable_scope = variable_scope

            self._preconditioner_decay_rate = tf.convert_to_tensor(
                preconditioner_decay_rate, name='preconditioner_decay_rate')
            self._batch_size = tf.convert_to_tensor(batch_size,
                                                    name='batch_size')
            self._total_num_examples = tf.convert_to_tensor(
                total_num_examples, name='total_num_examples')
            self._burnin = tf.convert_to_tensor(burnin, name='burnin')
            self._burnin_max_learning_rate = tf.convert_to_tensor(
                burnin_max_learning_rate, name='burnin_max_learning_rate')
            self._max_learning_rate = tf.convert_to_tensor(
                max_learning_rate, name='max_learning_rate')
            self._use_single_learning_rate = use_single_learning_rate

            with tf.variable_scope(self._variable_scope):
                self._counter = tf.get_variable('counter',
                                                initializer=0,
                                                trainable=False)

            self._preconditioner_decay_rate = control_flow_ops.with_dependencies([
                tf.assert_non_negative(
                    self._preconditioner_decay_rate,
                    message='`preconditioner_decay_rate` must be non-negative'
                ),
                tf.assert_less_equal(
                    self._preconditioner_decay_rate,
                    1.,
                    message='`preconditioner_decay_rate` must be at most 1.'),
            ], self._preconditioner_decay_rate)

            self._batch_size = control_flow_ops.with_dependencies([
                tf.assert_greater(
                    self._batch_size,
                    0,
                    message='`batch_size` must be greater than zero')
            ], self._batch_size)

            self._total_num_examples = control_flow_ops.with_dependencies([
                tf.assert_greater(
                    self._total_num_examples,
                    0,
                    message='`total_num_examples` must be greater than zero')
            ], self._total_num_examples)

            self._burnin = control_flow_ops.with_dependencies([
                tf.assert_non_negative(
                    self._burnin, message='`burnin` must be non-negative'),
                tf.assert_integer(self._burnin,
                                  message='`burnin` must be an integer')
            ], self._burnin)

            self._burnin_max_learning_rate = control_flow_ops.with_dependencies([
                tf.assert_non_negative(
                    self._burnin_max_learning_rate,
                    message='`burnin_max_learning_rate` must be non-negative')
            ], self._burnin_max_learning_rate)

            self._max_learning_rate = control_flow_ops.with_dependencies([
                tf.assert_non_negative(
                    self._max_learning_rate,
                    message='`max_learning_rate` must be non-negative')
            ], self._max_learning_rate)

            super(VariationalSGD, self).__init__(use_locking=False,
                                                 name=name or default_name)
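The `control_flow_ops.with_dependencies` pattern used throughout ties validation ops to a tensor's value; a minimal sketch (TF 1.x):

import tensorflow as tf
from tensorflow.python.ops import control_flow_ops

decay = tf.constant(0.95)
decay = control_flow_ops.with_dependencies([
    tf.assert_non_negative(decay),
    tf.assert_less_equal(decay, 1.),
], decay)
# Any op that consumes `decay` now also triggers both assertions.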
Beispiel #52
0
    def __init__(self,
                 df,
                 scale_operator,
                 input_output_cholesky=False,
                 validate_args=False,
                 allow_nan_stats=True,
                 name=None):
        """Construct Wishart distributions.

    Args:
      df: `float` or `double` tensor, the degrees of freedom of the
        distribution(s). `df` must be greater than or equal to `k`.
      scale_operator: `float` or `double` instance of `LinearOperator`.
      input_output_cholesky: Python `bool`. If `True`, functions whose input or
        output have the semantics of samples assume inputs are in Cholesky form
        and return outputs in Cholesky form. In particular, if this flag is
        `True`, input to `log_prob` is presumed of Cholesky form and output from
        `sample`, `mean`, and `mode` are of Cholesky form.  Setting this
        argument to `True` is purely a computational optimization and does not
        change the underlying distribution; for instance, `mean` returns the
        Cholesky of the mean, not the mean of Cholesky factors. The `variance`
        and `stddev` methods are unaffected by this flag.
        Default value: `False` (i.e., input/output does not have Cholesky
        semantics).
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
        result is undefined. When `False`, an exception is raised if one or
        more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      TypeError: if scale_operator is not floating-type
      TypeError: if scale_operator.dtype != df.dtype
      ValueError: if df < k, where the scale operator event shape is
        `(k, k)`
    """
        parameters = dict(locals())
        self._input_output_cholesky = input_output_cholesky
        with tf.name_scope(name) as name:
            with tf.name_scope("init", values=[df, scale_operator]):
                if not scale_operator.dtype.is_floating:
                    raise TypeError(
                        "scale_operator.dtype=%s is not a floating-point type"
                        % scale_operator.dtype)
                if not scale_operator.is_square:
                    raise ValueError("scale_operator must be square.")

                self._scale_operator = scale_operator
                self._df = tf.convert_to_tensor(df,
                                                dtype=scale_operator.dtype,
                                                name="df")
                contrib_tensor_util.assert_same_float_dtype(
                    (self._df, self._scale_operator))
                if (self._scale_operator.shape.ndims is None
                        or self._scale_operator.shape[-1].value is None):
                    self._dimension = tf.cast(
                        self._scale_operator.domain_dimension_tensor(),
                        dtype=self._scale_operator.dtype,
                        name="dimension")
                else:
                    self._dimension = tf.convert_to_tensor(
                        self._scale_operator.shape[-1].value,
                        dtype=self._scale_operator.dtype,
                        name="dimension")
                df_val = tensor_util.constant_value(self._df)
                dim_val = tensor_util.constant_value(self._dimension)
                if df_val is not None and dim_val is not None:
                    df_val = np.asarray(df_val)
                    if not df_val.shape:
                        df_val = [df_val]
                    if any(df_val < dim_val):
                        raise ValueError(
                            "Degrees of freedom (df = %s) cannot be less than "
                            "dimension of scale matrix (scale.dimension = %s)"
                            % (df_val, dim_val))
                elif validate_args:
                    assertions = tf.assert_less_equal(
                        self._dimension,
                        self._df,
                        message=("Degrees of freedom (df = %s) cannot be "
                                 "less than dimension of scale matrix "
                                 "(scale.dimension = %s)" %
                                 (self._dimension, self._df)))
                    self._df = control_flow_ops.with_dependencies([assertions],
                                                                  self._df)
        super(_WishartLinearOperator, self).__init__(
            dtype=self._scale_operator.dtype,
            validate_args=validate_args,
            allow_nan_stats=allow_nan_stats,
            reparameterization_type=tf.distributions.FULLY_REPARAMETERIZED,
            parameters=parameters,
            graph_parents=([self._df, self._dimension] +
                           self._scale_operator.graph_parents),
            name=name)
Beispiel #53
0
    def __init__(self,
                 learning_rate,
                 preconditioner_decay_rate=0.95,
                 data_size=1,
                 burnin=25,
                 diagonal_bias=1e-8,
                 name=None,
                 parallel_iterations=10,
                 variable_scope=None):
        default_name = 'StochasticGradientLangevinDynamics'
        with tf.name_scope(name, default_name, [
                learning_rate, preconditioner_decay_rate, data_size, burnin,
                diagonal_bias
        ]):
            if tf.executing_eagerly():
                raise NotImplementedError(
                    'Eager execution currently not supported for '
                    'SGLD optimizer.')
            if variable_scope is None:
                var_scope_name = tf.get_default_graph().unique_name(
                    name or default_name)
                with tf.variable_scope(var_scope_name) as scope:
                    self._variable_scope = scope
            else:
                self._variable_scope = variable_scope

            self._preconditioner_decay_rate = tf.convert_to_tensor(
                preconditioner_decay_rate, name='preconditioner_decay_rate')
            self._data_size = tf.convert_to_tensor(data_size, name='data_size')
            self._burnin = tf.convert_to_tensor(burnin, name='burnin')
            self._diagonal_bias = tf.convert_to_tensor(diagonal_bias,
                                                       name='diagonal_bias')
            self._learning_rate = tf.convert_to_tensor(learning_rate,
                                                       name='learning_rate')
            self._parallel_iterations = parallel_iterations

            with tf.variable_scope(self._variable_scope):
                self._counter = tf.get_variable('counter',
                                                initializer=0,
                                                trainable=False)

            self._preconditioner_decay_rate = control_flow_ops.with_dependencies([
                tf.assert_non_negative(
                    self._preconditioner_decay_rate,
                    message='`preconditioner_decay_rate` must be non-negative'
                ),
                tf.assert_less_equal(
                    self._preconditioner_decay_rate,
                    1.,
                    message='`preconditioner_decay_rate` must be at most 1.'),
            ], self._preconditioner_decay_rate)

            self._data_size = control_flow_ops.with_dependencies([
                tf.assert_greater(
                    self._data_size,
                    0,
                    message='`data_size` must be greater than zero')
            ], self._data_size)

            self._burnin = control_flow_ops.with_dependencies([
                tf.assert_non_negative(
                    self._burnin, message='`burnin` must be non-negative'),
                tf.assert_integer(self._burnin,
                                  message='`burnin` must be an integer')
            ], self._burnin)

            self._diagonal_bias = control_flow_ops.with_dependencies([
                tf.assert_non_negative(
                    self._diagonal_bias,
                    message='`diagonal_bias` must be non-negative')
            ], self._diagonal_bias)

            super(StochasticGradientLangevinDynamics,
                  self).__init__(use_locking=False, name=name or default_name)
Beispiel #54
0
def percentile(x,
               q,
               axis=None,
               interpolation=None,
               keep_dims=False,
               validate_args=False,
               name=None):
    """Compute the `q`-th percentile of `x`.

  Given a vector `x`, the `q`-th percentile of `x` is the value `q / 100` of the
  way from the minimum to the maximum in a sorted copy of `x`.

  The values and distances of the two nearest neighbors as well as the
  `interpolation` parameter will determine the percentile if the normalized
  ranking does not match the location of `q` exactly.

  This function is the same as the median if `q = 50`, the same as the minimum
  if `q = 0` and the same as the maximum if `q = 100`.


  ```python
  # Get 30th percentile with default ('nearest') interpolation.
  x = [1., 2., 3., 4.]
  percentile(x, q=30.)
  ==> 2.0

  # Get 30th percentile with 'lower' interpolation
  x = [1., 2., 3., 4.]
  percentile(x, q=30., interpolation='lower')
  ==> 1.0

  # Get 100th percentile (maximum).  By default, this is computed over every dim
  x = [[1., 2.],
       [3., 4.]]
  percentile(x, q=100.)
  ==> 4.0

  # Treat the leading dim as indexing samples, and find the 100th quantile (max)
  # over all such samples.
  x = [[1., 2.],
       [3., 4.]]
  percentile(x, q=100., axis=[0])
  ==> [3., 4.]
  ```

  Compare to `numpy.percentile`.

  Args:
    x:  Floating point `N-D` `Tensor` with `N > 0`.  If `axis` is not `None`,
      `x` must have statically known number of dimensions.
    q:  Scalar `Tensor` in `[0, 100]`. The percentile.
    axis:  Optional `0-D` or `1-D` integer `Tensor` with constant values.
      The axis that hold independent samples over which to return the desired
      percentile.  If `None` (the default), treat every dimension as a sample
      dimension, returning a scalar.
    interpolation : {"lower", "higher", "nearest"}.  Default: "nearest"
      This optional parameter specifies the interpolation method to
      use when the desired quantile lies between two data points `i < j`:
        * lower: `i`.
        * higher: `j`.
        * nearest: `i` or `j`, whichever is nearest.
    keep_dims:  Python `bool`. If `True`, the last dimension is kept with size 1.
      If `False`, the last dimension is removed from the output shape.
    validate_args:  Whether to add runtime checks of argument validity.
      If False, and arguments are incorrect, correct behavior is not guaranteed.
    name:  A Python string name to give this `Op`.  Default is "percentile"

  Returns:
    A `(N - len(axis))` dimensional `Tensor` of same dtype as `x`, or, if
      `axis` is `None`, a scalar.

  Raises:
    ValueError:  If argument 'interpolation' is not an allowed type.
  """
    name = name or "percentile"
    allowed_interpolations = {"lower", "higher", "nearest"}

    if interpolation is None:
        interpolation = "nearest"
    else:
        if interpolation not in allowed_interpolations:
            raise ValueError(
                "Argument 'interpolation' must be in %s.  Found %s" %
                (allowed_interpolations, interpolation))

    with tf.name_scope(name, [x, q]):
        x = tf.convert_to_tensor(x, name="x")
        # Double is needed here and below, else we get the wrong index if the array
        # is huge along axis.
        q = tf.to_double(q, name="q")
        _get_static_ndims(q, expect_ndims=0)

        if validate_args:
            q = control_flow_ops.with_dependencies([
                tf.assert_rank(q, 0),
                tf.assert_greater_equal(q, tf.to_double(0.)),
                tf.assert_less_equal(q, tf.to_double(100.))
            ], q)

        if axis is None:
            y = tf.reshape(x, [-1])
        else:
            axis = tf.convert_to_tensor(axis, name="axis")
            tf.assert_integer(axis)
            axis_ndims = _get_static_ndims(axis,
                                           expect_static=True,
                                           expect_ndims_no_more_than=1)
            axis_const = tensor_util.constant_value(axis)
            if axis_const is None:
                raise ValueError(
                    "Expected argument 'axis' to be statically available.  Found: %s"
                    % axis)
            axis = axis_const
            if axis_ndims == 0:
                axis = [axis]
            axis = [int(a) for a in axis]
            x_ndims = _get_static_ndims(x,
                                        expect_static=True,
                                        expect_ndims_at_least=1)
            axis = _make_static_axis_non_negative(axis, x_ndims)
            y = _move_dims_to_flat_end(x, axis, x_ndims)

        frac_at_q_or_above = 1. - q / 100.
        d = tf.to_double(tf.shape(y)[-1])

        if interpolation == "lower":
            index = tf.ceil((d - 1) * frac_at_q_or_above)
        elif interpolation == "higher":
            index = tf.floor((d - 1) * frac_at_q_or_above)
        elif interpolation == "nearest":
            index = tf.round((d - 1) * frac_at_q_or_above)

        # If d is gigantic, then we would have d == d - 1, even in double... So
        # let's use max/min to avoid out of bounds errors.
        d = tf.shape(y)[-1]
        # d - 1 will be distinct from d in int32.
        index = tf.clip_by_value(tf.to_int32(index), 0, d - 1)

        # Sort everything, not just the top 'k' entries, which allows multiple calls
        # to sort only once (under the hood) and use CSE.
        sorted_y = _sort_tensor(y)

        # result.shape = B
        result = sorted_y[..., index]
        result.set_shape(y.get_shape()[:-1])

        if keep_dims:
            if axis is None:
                # ones_vec = [1, 1,..., 1], total length = len(S) + len(B).
                ones_vec = tf.ones(shape=[_get_best_effort_ndims(x)],
                                   dtype=tf.int32)
                result *= tf.ones(ones_vec, dtype=x.dtype)
            else:
                result = _insert_back_keep_dims(result, axis)

        return result
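A usage sketch with validation enabled (the values follow the docstring's own axis example):

x = tf.constant([[1., 2.],
                 [3., 4.]])
p = percentile(x, q=100., axis=[0], validate_args=True, keep_dims=True)
# p has shape [1, 2] and evaluates to [[3., 4.]]; validate_args adds runtime
# checks that q is a scalar in [0, 100].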
Beispiel #55
0
def embed(input_ids,
          vocab_size,
          embedding_size,
          position_offset=0,
          initializer_range=0.02,
          max_position_embeddings=512,
          use_one_hot_embeddings=True):
    """reur and position embeddings
    :param input_ids: int Tensor of shape [batch_size, seq_length].
    :param vocab_size: number of words in vocab
    :param embedding_size: dimensionality of the embedding
    :param position_offset: aka number of cached tokens.
    :param initializer_range: float. Range of the weight initialization.
    :param max_position_embeddings: int. Maximum sequence length.
    :param use_one_hot_embeddings: probably want this to be true
    :return: [batch_size, seq_length, embedding_size] embedded tensor
    """
    (batch_size, seq_length) = get_shape_list(input_ids, expected_rank=2)

    embedding_table = tf.get_variable(
        name='word_embed',
        shape=[vocab_size, embedding_size],
        initializer=create_initializer(initializer_range),
    )

    assert_op = tf.assert_less_equal(tf.reduce_max(input_ids), vocab_size - 1)
    with tf.control_dependencies([assert_op]):
        if use_one_hot_embeddings:
            flat_input_ids = tf.reshape(input_ids, [-1])
            one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
            output_flat = tf.matmul(one_hot_input_ids, embedding_table)
        else:
            output_flat = tf.nn.embedding_lookup(embedding_table, input_ids)

        embedded_input = tf.reshape(output_flat,
                                    [batch_size, seq_length, embedding_size])

    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)

    with tf.control_dependencies([assert_op]):
        full_position_embeddings = tf.get_variable(
            name='pos_embed',
            shape=[max_position_embeddings, embedding_size],
            initializer=create_initializer(initializer_range),
        )
        # Since the position embedding table is a learned variable, we create it
        # using a (long) sequence length `max_position_embeddings`. The actual
        # sequence length might be shorter than this, for faster training of
        # tasks that do not have long sequences.
        #
        # So `full_position_embeddings` is effectively an embedding table
        # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
        # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
        # perform a slice.
        if position_offset == 0:
            embedded_input += tf.slice(full_position_embeddings, [0, 0],
                                       [seq_length, -1])[None]
        else:
            # A slice at a dynamic `position_offset` is awkward in TensorFlow,
            # so use a one-hot matmul instead.
            flat_pos_ids = (tf.range(seq_length, dtype=tf.int32) +
                            position_offset)
            one_hot_pos_ids = tf.one_hot(flat_pos_ids,
                                         depth=max_position_embeddings)

            # [seq_length, max_position_embeddings] @ [max_position_embeddings, embedding_size]
            seq_embeds = tf.matmul(one_hot_pos_ids, full_position_embeddings)
            embedded_input += seq_embeds[None]

            # embedded_input += tf.slice(full_position_embeddings[position_offset:], [0, 0], [seq_length, -1])[None]

    return layer_norm(embedded_input, name='embed_norm'), embedding_table
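A hypothetical call with no decoding cache, so `position_offset=0` and the cheap slice path is taken (`create_initializer` and `layer_norm` are assumed in scope):

input_ids = tf.placeholder(tf.int32, shape=[None, 128])
embedded, table = embed(input_ids,
                        vocab_size=50257,
                        embedding_size=768,
                        position_offset=0,
                        max_position_embeddings=1024)
# embedded: [batch, 128, 768]; table: [50257, 768], returned for reuse.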
Beispiel #56
0
    def __call__(self, inputs, state, scope=None):
        (
            past_cand_symbols, # [batch_size, max_len]
            past_cand_logprobs,# [batch_size]
            past_beam_symbols, # [batch_size*self.beam_size, max_len], right-aligned!!!
            past_beam_logprobs,# [batch_size*self.beam_size]
            past_cell_state,
                ) = state

        batch_size = tf.shape(past_cand_logprobs)[0] # TODO: get as int, if possible

        full_size = batch_size * self.beam_size

        cell_inputs = inputs
        cell_outputs, raw_cell_state = self.cell(cell_inputs, past_cell_state)

        logprobs = tf.nn.log_softmax(cell_outputs)

        logprobs_batched = tf.reshape(logprobs + tf.expand_dims(past_beam_logprobs, 1),
                                      [-1, self.beam_size * self.num_classes])
        logprobs_batched.set_shape((None, self.beam_size * self.num_classes))

        # Attach the sanity checks via control dependencies; a bare tf.assert_*
        # call creates an op that would otherwise never run.
        assert_ops = [
            tf.assert_less_equal(logprobs, 0.0),
            tf.assert_less_equal(past_beam_logprobs, 0.0),
        ]
        with tf.control_dependencies(assert_ops):
            logprobs_batched = tf.identity(logprobs_batched)

        masked_logprobs = tf.reshape(logprobs_batched, [-1, self.beam_size * self.num_classes])
        # print masked_logprobs.get_shape()

        beam_logprobs, indices = tf.nn.top_k(
            masked_logprobs,
            self.beam_size
        )

        beam_logprobs = tf.reshape(beam_logprobs, [-1])

        # For continuing to the next symbols
        symbols = indices % self.num_classes # [batch_size, self.beam_size]
        parent_refs = tf.reshape(indices // self.num_classes, [-1]) # [batch_size*self.beam_size]

        # TODO: this technically doesn't need to be recalculated every loop
        parent_refs_offsets = (tf.range(full_size) // self.beam_size) * self.beam_size
        parent_refs = parent_refs + parent_refs_offsets

        if past_beam_symbols is not None:
            symbols_history = tf.gather(past_beam_symbols, parent_refs)
            beam_symbols = tf.concat([tf.reshape(symbols, [-1, 1]), symbols_history], 1)
        else:
            beam_symbols = tf.reshape(symbols, [-1, 1])

        # Above ends up outputting reversed. Below doesn't work though because tf doesn't support negative indexing.
        # last = past_beam_symbols.get_shape()[1]
        # symbols_history = tf.gather(past_beam_symbols[:,last - 1], parent_refs)
        # beam_symbols = tf.concat(1, [past_beam_symbols[:,:last-1], tf.reshape(symbols_history, [-1, 1]), tf.reshape(symbols, [-1, 1]), ])

        # Handle the output and the cell state shuffling
        outputs = tf.reshape(symbols, [-1]) # [batch_size*beam_size]
        cell_state = nest_map(
            lambda element: tf.gather(element, parent_refs),
            raw_cell_state
        )

        # Handling for getting a done token
        # logprobs_done = tf.reshape(logprobs_batched, [-1, self.beam_size, self.num_classes])[:,:,self.stop_token]
        # done_parent_refs = tf.to_int32(tf.argmax(logprobs_done, 1))
        # done_parent_refs_offsets = tf.range(batch_size) * self.beam_size
        # done_symbols = tf.gather(past_beam_symbols, done_parent_refs + done_parent_refs_offsets)

        # logprobs_done_max = tf.reduce_max(logprobs_done, 1)
        # cand_symbols = tf.select(logprobs_done_max > past_cand_logprobs,
        #                         done_symbols,
        #                         past_cand_symbols)
        # cand_logprobs = tf.maximum(logprobs_done_max, past_cand_logprobs)
        cand_symbols = past_cand_symbols # current last symbol in the beam [batch_size*self.beam_size]
        cand_logprobs = past_cand_logprobs

        return outputs, (
            cand_symbols,
            cand_logprobs,
            beam_symbols,
            beam_logprobs,
            cell_state,
        )
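The `parent_refs_offsets` bookkeeping maps beam-local parent indices into the flattened `[batch * beam]` space; a quick NumPy check:

import numpy as np

batch_size, beam_size = 2, 3
full_size = batch_size * beam_size
offsets = (np.arange(full_size) // beam_size) * beam_size
print(offsets)  # [0 0 0 3 3 3] -- each beam indexes parents in its own block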
Beispiel #57
0
def model_fn(features, labels, mode, params):
    image = features['image']
    num_classes = params['model']['num_classes']
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # build convolutional layers
    conv = build_conv_layers(image, params['model']['conv_layers'], is_training)

    # load convolutional and dense layers from a checkpoint
    freeze_variables = {}
    checkpoint_path = params['training'].get('checkpoint_path')
    freeze_restored_variables = params['training'].get('freeze_restored_variables', False)
    if checkpoint_path:
        tvars = tf.trainable_variables()
        assignment_map = {}
        for var in tvars:
            assignment_map[var.name[:-2]] = var
            if freeze_restored_variables:
                freeze_variables[var.name] = True

        tf.train.init_from_checkpoint(root_dir(checkpoint_path), assignment_map)

    # build dense layers
    dense = build_dense_layers(conv, params['model']['dense_layers'], is_training)

    # get logits
    if 'subnet' in params:
        # build NN for each neuron
        subnet_dropout_rate = params['model']['subnet'].get('subnet_dropout_rate', 0)
        if subnet_dropout_rate:
            dense = tf.layers.dropout(inputs=dense, rate=subnet_dropout_rate, training=is_training)

        logits_layer_params = dict(params['model']['logits_layer'])
        logits_layer_params['num_units'] = 1

        logits_concat = []
        for i in range(num_classes):
            subnet_dense = build_dense_layers(dense, params['model']['subnet']['dense_layers'], is_training)
            subnet_logits = build_dense_layers(subnet_dense, [logits_layer_params], is_training)
            logits_concat.append(subnet_logits)

        logits = tf.concat(logits_concat, axis=-1)
    else:
        # a single layer to get a spike
        logits_layer_params = dict(params['model']['logits_layer'])
        logits_layer_params['num_units'] = num_classes
        logits = build_dense_layers(dense, [logits_layer_params], is_training)

    # return prediction specification
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions={'spikes': logits})

    # make sure that images were distorted correctly and display them in TensorBoard
    max_images = 12
    images = image[:max_images]
    assert_min = tf.assert_greater_equal(tf.reduce_min(images), 0.0, message='Image contains values less than 0')
    assert_max = tf.assert_less_equal(tf.reduce_max(images), 1.0, message='Image contains values greater than 1')
    with tf.control_dependencies([assert_min, assert_max]):
        tf.summary.image('images', tf.cast(images * 255, dtype=tf.uint8), max_outputs=max_images)

    # compute the loss
    nan_mask = tf.cast(features['nan_mask'], tf.float32)
    mse_loss = tf.losses.mean_squared_error(labels=labels, predictions=logits, weights=nan_mask)
    loss = mse_loss + tf.losses.get_regularization_loss()

    # get train variables
    train_vars = [var for var in tf.trainable_variables() if var.name not in freeze_variables]

    # return training specification
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            learning_rate=params['training']['learning_rate'],
            optimizer='Adam',
            summaries=['learning_rate', 'loss', 'gradients', 'gradient_norm'],
            variables=train_vars,
        )

        # perform update ops for batch normalization
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group([train_op, update_ops])

        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # evaluation metrics
    eval_metric_ops = {
        'rmse': tf.metrics.root_mean_squared_error(labels=labels, predictions=logits, weights=nan_mask),
    }

    # RMSE per column
    for i in range(num_classes):
        eval_metric_ops['rmse/column%d' % i] = tf.metrics.root_mean_squared_error(labels=labels[:, i],
                                                                                  predictions=logits[:, i],
                                                                                  weights=nan_mask[:, i])

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
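A hypothetical wiring of this `model_fn` into an Estimator. The nested `params` layout mirrors the lookups above; `build_conv_layers` and `build_dense_layers` are assumed helpers that consume the layer-spec lists:

params = {
    'model': {
        'num_classes': 10,
        'conv_layers': [],    # layer specs for build_conv_layers (assumed)
        'dense_layers': [],   # layer specs for build_dense_layers (assumed)
        'logits_layer': {'num_units': None},  # num_units is overwritten above
    },
    'training': {'learning_rate': 1e-3},
}
estimator = tf.estimator.Estimator(model_fn=model_fn, params=params,
                                   model_dir='/tmp/spikes_model')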
Beispiel #58
0
 def test_doesnt_raise_when_equal(self):
   with self.test_session():
     small = tf.constant([1, 2], name="small")
     with tf.control_dependencies([tf.assert_less_equal(small, small)]):
       out = tf.identity(small)
     out.eval()
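For contrast, the complementary failing case in the same harness would look roughly like this (a sketch; the regex matches the tensor names reported in the assert's error message):

 def test_raises_when_greater(self):
   with self.test_session():
     small = tf.constant([1, 2], name="small")
     big = tf.constant([3, 4], name="big")
     with tf.control_dependencies([tf.assert_less_equal(big, small)]):
       out = tf.identity(small)
     with self.assertRaisesOpError("big.*small"):
       out.eval()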