def replace(self, episodes, length, rows=None):
    """Overwrite complete episodes held in the buffers.

    Args:
      episodes: Tuple of transition quantities with batch and time dimensions.
      length: Batch of sequence lengths.
      rows: Episodes to replace, defaults to all.

    Returns:
      Operation.
    """
    if rows is None:
        rows = tf.range(self._capacity)
    assert rows.shape.ndims == 1

    capacity_check = tf.assert_less(
        rows, self._capacity, message='capacity exceeded')
    with tf.control_dependencies([capacity_check]):
        length_check = tf.assert_less_equal(
            length, self._max_length, message='max length exceeded')

    # Every buffer update waits for the length check, which in turn waits for
    # the capacity check.
    with tf.control_dependencies([length_check]):
        buffer_updates = [
            tf.scatter_update(buffer_, rows, elements)
            for buffer_, elements in zip(self._buffers, episodes)
        ]

    # The stored lengths are only written once all buffers were updated.
    with tf.control_dependencies(buffer_updates):
        return tf.scatter_update(self._length, rows, length)
def preprocess_for_inception(images):
    """Apply the Inception preprocessing to every image of a minibatch.

    Args:
      images: images minibatch. Shape [batch size, width, height, channels].
        Values are in [0..255].

    Returns:
      preprocessed_images
    """
    # Only 3-channel images are accepted.
    assert images.shape[3].value == 3

    # Enforce the documented [0, 255] value range when the graph runs. The
    # values themselves are forwarded unchanged; no rescaling is done here.
    range_checks = [
        tf.assert_greater_equal(images, 0.0),
        tf.assert_less_equal(images, 255.0),
    ]
    with tf.control_dependencies(range_checks):
        images = tf.identity(images)

    return tf.map_fn(
        fn=tfgan_eval.preprocess_image,
        elems=images,
        back_prop=False)
def new_mean_squared(grad_vec, decay, ms):
    """Compute the updated running mean of the squared gradient.

    Args:
      grad_vec: the vector for the current gradient
      decay: the decay term
      ms: the previous mean_squared value

    Returns:
      the new mean_squared value
    """
    num_decay_elements = decay.get_shape().num_elements()
    decay_in_unit_interval = [
        tf.assert_less_equal(decay, 1., summarize=num_decay_elements),
        tf.assert_greater_equal(decay, 0., summarize=num_decay_elements),
    ]
    with tf.control_dependencies(decay_in_unit_interval):
        grad_squared = tf.square(grad_vec)

    # An all-zero previous value (expected only on the first timestep) means
    # there is nothing to decay from: use a zero decay so that the full
    # grad_squared is returned.
    previous_is_zero = tf.reduce_all(tf.equal(ms, 0.))
    decay = tf.cond(
        previous_is_zero,
        lambda: tf.zeros_like(decay, dtype=tf.float32),
        lambda: decay)

    # Running average of the squared gradients.
    epsilon = 1e-12
    current_term = (1. - decay) * (grad_squared + epsilon)
    return current_term + decay * ms
def calculate_reshape(original_shape, new_shape, validate=False, name=None):
    """Calculates the reshaped dimensions (replacing up to one -1 in reshape).

    Args:
      original_shape: Shape vector of the tensor being reshaped.
      new_shape: Requested shape vector; may contain a single `-1`.
      validate: Python bool. When true, runtime assertion ops are built and
        returned for the dynamic case.
      name: Optional name scope for the created ops.

    Returns:
      A tuple `(expanded_new_shape, static_new_shape, validations)`: the new
      shape with `-1` resolved, the statically known part of `new_shape`, and
      a (possibly empty) list of assertion ops.
    """
    batch_shape_static = tensor_util.constant_value_as_shape(new_shape)

    # Fully static shapes need neither graph ops nor runtime validation.
    if batch_shape_static.is_fully_defined():
        static_dims = np.int32(batch_shape_static.as_list())
        return static_dims, batch_shape_static, []

    with tf.name_scope(name, "calculate_reshape", [original_shape, new_shape]):
        original_size = tf.reduce_prod(original_shape)
        implicit_dim = tf.equal(new_shape, -1)
        # With exactly one `-1`, the product of `new_shape` is the negated
        # product of the known dimensions.
        known_size = tf.maximum(1, -tf.reduce_prod(new_shape))
        size_implicit_dim = original_size // known_size
        new_ndims = tf.shape(new_shape)
        expanded_new_shape = tf.where(  # Assumes exactly one `-1`.
            implicit_dim,
            tf.fill(new_ndims, size_implicit_dim),
            new_shape)

        validations = []
        if validate:
            validations = [
                tf.assert_rank(
                    original_shape, 1,
                    message="Original shape must be a vector."),
                tf.assert_rank(
                    new_shape, 1, message="New shape must be a vector."),
                tf.assert_less_equal(
                    tf.count_nonzero(implicit_dim, dtype=tf.int32),
                    1,
                    message="At most one dimension can be unknown."),
                tf.assert_positive(
                    expanded_new_shape,
                    message="Shape elements must be >=-1."),
                tf.assert_equal(
                    tf.reduce_prod(expanded_new_shape),
                    original_size,
                    message="Shape sizes do not match."),
            ]
        return expanded_new_shape, batch_shape_static, validations
def test_doesnt_raise_when_both_empty(self):
    """The assertion must pass when both compared tensors are empty."""
    with self.test_session():
        larry = tf.constant([])
        curly = tf.constant([])
        less_equal_check = tf.assert_less_equal(larry, curly)
        with tf.control_dependencies([less_equal_check]):
            out = tf.identity(larry)
        out.eval()
def test_doesnt_raise_when_less_equal_and_broadcastable_shapes(self):
    """The assertion must pass for broadcastable operands that satisfy <=."""
    with self.test_session():
        small = tf.constant([1], name="small")
        big = tf.constant([3, 1], name="big")
        less_equal_check = tf.assert_less_equal(small, big)
        with tf.control_dependencies([less_equal_check]):
            out = tf.identity(small)
        out.eval()
def _maybe_check_valid_shape(self, shape, validate_args):
    """Check that a shape Tensor is int-type and otherwise sane.

    Everything that can be verified from statically known values is checked
    immediately and raises; the remaining checks are returned as assertion ops
    when `validate_args` is true.

    Args:
      shape: Integer `Tensor` describing a shape. It may contain at most one
        `-1`, and no element smaller than `-1`.
      validate_args: Python `bool`. When true, checks that cannot be resolved
        statically are returned as runtime assertion ops.

    Returns:
      A (possibly empty) list of assertion ops.

    Raises:
      TypeError: if `shape` does not have an integer dtype.
      ValueError: if the statically known rank of `shape` exceeds 1, or its
        statically known value has more than one `-1` or an element below
        `-1`.
    """
    if not shape.dtype.is_integer:
        raise TypeError('{} dtype ({}) should be `int`-like.'.format(
            shape, shape.dtype.name))

    assertions = []

    ndims = tf.rank(shape)
    ndims_ = tensor_util.constant_value(ndims)
    if ndims_ is not None and ndims_ > 1:
        raise ValueError('`{}` rank ({}) should be <= 1.'.format(
            shape, ndims_))
    elif validate_args:
        assertions.append(
            tf.assert_less_equal(
                ndims, 1, message='`{}` rank should be <= 1.'.format(shape)))

    # Note, we might be inclined to use tensor_util.constant_value_as_shape
    # here, but that method coerces negative values into `None`s, rendering the
    # checks we do below impossible.
    shape_tensor_ = tensor_util.constant_value(shape)
    if shape_tensor_ is not None:
        es = np.int32(shape_tensor_)
        # `np.count_nonzero` also handles the 0-d value produced by a scalar
        # shape (which the rank check above permits); the builtin `sum` would
        # raise a TypeError on it.
        if np.count_nonzero(es == -1) > 1:
            raise ValueError(
                '`{}` must have at most one `-1` (given {})'
                .format(shape, es))
        if np.any(es < -1):
            raise ValueError(
                '`{}` elements must be either positive integers or `-1` '
                '(given {}).'
                .format(shape, es))
    elif validate_args:
        assertions.extend([
            tf.assert_less_equal(
                tf.reduce_sum(tf.cast(tf.equal(shape, -1), tf.int32)),
                1,
                message='`{}` elements must have at most one `-1`.'
                .format(shape)),
            tf.assert_greater_equal(
                shape,
                -1,
                message='`{}` elements must be either positive integers or `-1`.'
                .format(shape)),
        ])
    return assertions
def test_raises_when_greater(self):
    """The assertion must fail when the first operand is the larger one."""
    with self.test_session():
        small = tf.constant([1, 2], name="small")
        big = tf.constant([3, 4], name="big")
        # Operands are deliberately swapped so that the check is violated.
        violated_check = tf.assert_less_equal(big, small)
        with tf.control_dependencies([violated_check]):
            out = tf.identity(small)
        with self.assertRaisesOpError("big.*small"):
            out.eval()
def remidify(pitches):
    """Transforms [0, 88) to MIDI pitches [21, 108]."""
    lower_bound_ok = tf.assert_greater_equal(pitches, 0)
    upper_bound_ok = tf.assert_less_equal(pitches, 87)
    # The shift is only evaluated once both range checks have passed.
    with tf.control_dependencies([lower_bound_ok, upper_bound_ok]):
        return pitches + 21
def demidify(pitches):
    """Transforms MIDI pitches [21,108] to [0, 88)."""
    lower_bound_ok = tf.assert_greater_equal(pitches, 21)
    upper_bound_ok = tf.assert_less_equal(pitches, 108)
    # The shift is only evaluated once both range checks have passed.
    with tf.control_dependencies([lower_bound_ok, upper_bound_ok]):
        return pitches - 21
def _augment_data(self, inout, nchan=6):
    """Flip, crop and rotate samples randomly.

    Args:
      inout: Rank-3 image tensor whose last dimension has `nchan` entries;
        axis 0 is treated as height and axis 1 as width by the crop below.
      nchan: Number of channels of `inout`.

    Returns:
      A tuple `(fullres, inout)`: the augmented sample at
      `self.output_resolution`, and the same sample resized to 256x256 with
      nearest-neighbor interpolation.
    """
    with tf.name_scope('data_augmentation'):
        # Each augmentation is enabled by a flag on the instance; the seeds
        # are fixed constants.
        if self.fliplr:
            inout = tf.image.random_flip_left_right(inout, seed=1234)
        if self.flipud:
            inout = tf.image.random_flip_up_down(inout, seed=3456)
        if self.rotate:
            # Integer drawn from {0, 1, 2, 3}: number of 90-degree rotations.
            # 0 falls through to the default branch and leaves the sample
            # untouched.
            angle = tf.random_uniform((), minval=0, maxval=4, dtype=tf.int32, seed=4567)
            inout = tf.case([(tf.equal(angle, 1), lambda: tf.image.rot90(inout, k=1)),
                             (tf.equal(angle, 2), lambda: tf.image.rot90(inout, k=2)),
                             (tf.equal(angle, 3), lambda: tf.image.rot90(inout, k=3))],
                            lambda: inout)

        # Re-declare the static shape (spatial dimensions unknown, channel
        # count known).
        inout.set_shape([None, None, nchan])

        with tf.name_scope('crop'):
            shape = tf.shape(inout)
            new_height = tf.to_int32(self.output_resolution[0])
            new_width = tf.to_int32(self.output_resolution[1])
            # The crop target must not be larger than the input.
            height_ok = tf.assert_less_equal(new_height, shape[0])
            width_ok = tf.assert_less_equal(new_width, shape[1])
            with tf.control_dependencies([height_ok, width_ok]):
                if self.random_crop:
                    inout = tf.random_crop(
                        inout, tf.stack([new_height, new_width, nchan]))
                else:
                    # Centered crop.
                    height_offset = tf.to_int32((shape[0]-new_height)/2)
                    width_offset = tf.to_int32((shape[1]-new_width)/2)
                    inout = tf.image.crop_to_bounding_box(
                        inout, height_offset, width_offset,
                        new_height, new_width)

        inout.set_shape([None, None, nchan])
        # NOTE(review): the crop above already targets `output_resolution`;
        # this resize looks like it only pins the static spatial size -
        # confirm before removing.
        inout = tf.image.resize_images(
            inout, [self.output_resolution[0], self.output_resolution[1]])
        fullres = inout

        with tf.name_scope('resize'):
            # Fixed-size low-resolution copy of the full-resolution sample.
            new_size = 256
            inout = tf.image.resize_images(
                inout, [new_size, new_size],
                method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

        return fullres, inout
def test_raises_when_less_equal_but_non_broadcastable_shapes(self):
    """Operands whose shapes cannot broadcast must raise a ValueError."""
    with self.test_session():
        small = tf.constant([1, 1, 1], name="small")
        big = tf.constant([3, 1], name="big")
        with self.assertRaisesRegexp(ValueError, "broadcast"):
            incompatible_check = tf.assert_less_equal(small, big)
            with tf.control_dependencies([incompatible_check]):
                out = tf.identity(small)
            out.eval()
def scale_to_inception_range(image):
    """Scales an image in the range [0,1] to [-1,1] as expected by inception."""
    # The input is required to be already scaled to [0, 1].
    max_ok = tf.assert_less_equal(tf.reduce_max(image), 1.)
    min_ok = tf.assert_greater_equal(tf.reduce_min(image), 0.)
    with tf.control_dependencies([max_ok, min_ok]):
        centered = tf.subtract(image, 0.5)
        image = tf.multiply(centered, 2.0)
    return image
def _maybe_assert_valid_y(self, y):
    """Attach range assertions to `y` when argument validation is enabled.

    Args:
      y: Input of the inverse transformation.

    Returns:
      `y` itself when `self.validate_args` is false; otherwise `y` with
      control dependencies on a non-negativity check and a `<= 1` check.
    """
    if self.validate_args:
        lower_check = tf.assert_non_negative(
            y,
            message="Inverse transformation input must be greater than 0.")
        upper_check = tf.assert_less_equal(
            y,
            tf.constant(1., y.dtype),
            message="Inverse transformation input must be less than or equal to 1.")
        return control_flow_ops.with_dependencies(
            [lower_check, upper_check], y)
    return y
def _maybe_assert_valid(self, x):
    """Attach range assertions to `x` when argument validation is enabled.

    Args:
      x: Sample to check.

    Returns:
      `x` itself when `self.validate_args` is false; otherwise `x` with
      control dependencies on a non-negativity check and a `<= 1` check.
    """
    if self.validate_args:
        lower_check = tf.assert_non_negative(
            x, message="sample must be non-negative")
        upper_check = tf.assert_less_equal(
            x,
            tf.ones([], self.concentration0.dtype),
            message="sample must be no larger than `1`.")
        return control_flow_ops.with_dependencies(
            [lower_check, upper_check], x)
    return x
def _maybe_assert_valid_sample(self, counts):
    """Check counts for proper shape, values, then return tensor version.

    Args:
      counts: Sample counts to validate.

    Returns:
      `counts` itself when `self.validate_args` is false; otherwise the
      checked counts with a control dependency asserting
      `counts <= self.total_count`.
    """
    if self.validate_args:
        counts = distribution_util.embed_check_nonnegative_integer_form(counts)
        within_total = tf.assert_less_equal(
            counts,
            self.total_count,
            message="counts are not less than or equal to n.")
        return control_flow_ops.with_dependencies([within_total], counts)
    return counts
def _validate_correlationness(self, x):
    """Attach correlation-matrix assertions to `x` when validation is enabled.

    Args:
      x: Candidate correlation matrix (or batch of matrices).

    Returns:
      `x` itself when `self.validate_args` is false; otherwise an identity of
      `x` that depends on checks for entries in [-1, 1], a unit diagonal and
      symmetry.
    """
    if self.validate_args:
        dtype = x.dtype.base_dtype
        at_least_minus_one = tf.assert_less_equal(
            tf.cast(-1., dtype=dtype),
            x,
            message='Correlations must be >= -1.')
        at_most_one = tf.assert_less_equal(
            x,
            tf.cast(1., dtype),
            message='Correlations must be <= 1.')
        unit_diagonal = tf.assert_near(
            tf.matrix_diag_part(x),
            tf.cast(1., dtype),
            message='Self-correlations must be = 1.')
        symmetric = tf.assert_near(
            x,
            tf.matrix_transpose(x),
            message='Correlation matrices must be symmetric')
        all_checks = [
            at_least_minus_one, at_most_one, unit_diagonal, symmetric]
        with tf.control_dependencies(all_checks):
            return tf.identity(x)
    return x
def maybe_split_sequence_lengths(sequence_length, num_splits, total_length):
    """Validates and splits `sequence_length`, if necessary.

    Returned value must be used in graph for all validations to be executed.

    Args:
      sequence_length: A batch of sequence lengths, either sized `[batch_size]`
        and equal to either 0 or `total_length`, or sized
        `[batch_size, num_splits]`.
      num_splits: The scalar number of splits of the full sequences.
      total_length: The scalar total sequence length (potentially padded).

    Returns:
      sequence_length: If input shape was `[batch_size, num_splits]`, returns
        the same Tensor. Otherwise, returns a Tensor of that shape with each
        input length in the batch divided by `num_splits`.

    Raises:
      ValueError: If `sequence_length` is not shaped `[batch_size]` or
        `[batch_size, num_splits]`, or if it is a vector and `total_length`
        is not evenly divisible by `num_splits`.
      tf.errors.InvalidArgumentError: If `sequence_length` is shaped
        `[batch_size]` and all values are not either 0 or `total_length`.
    """
    ndims = sequence_length.shape.ndims
    if ndims not in (1, 2):
        raise ValueError(
            'Sequence lengths must be given as a vector or a 2D Tensor whose '
            'second dimension size matches its initial hierarchical split. Got '
            'shape: %s' % sequence_length.shape.as_list())

    if ndims == 1:
        if total_length % num_splits != 0:
            raise ValueError(
                '`total_length` must be evenly divisible by `num_splits`.')
        # Every entry must be either empty or a full-length sequence.
        is_empty = tf.equal(sequence_length, 0)
        is_full = tf.equal(sequence_length, total_length)
        all_valid = tf.reduce_all(tf.logical_or(is_empty, is_full))
        value_check = tf.Assert(all_valid, data=[sequence_length])
        with tf.control_dependencies([value_check]):
            per_split = tf.tile(
                tf.expand_dims(sequence_length, axis=1), [1, num_splits])
            sequence_length = per_split // num_splits
    else:
        max_segment_length = tf.constant(total_length // num_splits, tf.int32)
        segment_check = tf.assert_less_equal(
            sequence_length,
            max_segment_length,
            message='Segment length cannot be more than '
                    '`total_length / num_splits`.')
        with tf.control_dependencies([segment_check]):
            sequence_length = tf.identity(sequence_length)
        sequence_length.set_shape([sequence_length.shape[0], num_splits])
    return sequence_length
def _maximum_mean(samples, envelope, high, name=None):
    """Returns a stochastic upper bound on the mean of a scalar distribution.

    The idea is that if the true CDF is within an `eps`-envelope of the
    empirical CDF of the samples, and the support is bounded above, then
    the mean is bounded above as well. In symbols,

    ```none
    sup_x(|F_n(x) - F(x)|) < eps
    ```

    The 0th dimension of `samples` is interpreted as independent and
    identically distributed samples. The remaining dimensions are
    broadcast together with `envelope` and `high`, and operated on
    separately.

    Args:
      samples: Floating-point `Tensor` of samples from the distribution(s)
        of interest. Entries are assumed IID across the 0th dimension.
        The other dimensions must broadcast with `envelope` and `high`.
      envelope: Floating-point `Tensor` of sizes of admissible CDF
        envelopes (i.e., the `eps` above).
      high: Floating-point `Tensor` of upper bounds on the distributions'
        supports. `samples <= high`.
      name: A name for this operation (optional).

    Returns:
      bound: Floating-point `Tensor` of upper bounds on the true means.

    Raises:
      InvalidArgumentError: If some `sample` is found to be larger than
        the corresponding `high`.
    """
    with tf.name_scope(name, "maximum_mean", [samples, envelope, high]):
        # Bring all three inputs to one dtype, defaulting to float32.
        dtype = dtype_util.common_dtype([samples, envelope, high], tf.float32)
        samples = tf.convert_to_tensor(samples, name="samples", dtype=dtype)
        envelope = tf.convert_to_tensor(envelope, name="envelope", dtype=dtype)
        high = tf.convert_to_tensor(high, name="high", dtype=dtype)

        # The largest sample along the IID axis must respect the bound.
        sample_max = tf.reduce_max(samples, axis=[0])
        support_check = tf.assert_less_equal(
            sample_max,
            high,
            message="Given sample maximum value exceeds expectations")
        with tf.control_dependencies([support_check]):
            bound = _do_maximum_mean(samples, envelope, high)
            return tf.identity(bound)
def _init_clusters_random(self):
    """Does random initialization of clusters.

    Returns:
      Tensor of randomly initialized clusters.
    """
    rows_per_input = [tf.shape(inp)[0] for inp in self._inputs]
    num_data = tf.add_n(rows_per_input)
    # Note that for mini-batch k-means, we should ensure that the batch size
    # of data used during initialization is sufficiently large to avoid
    # duplicated clusters.
    enough_data = tf.assert_less_equal(self._num_clusters, num_data)
    with tf.control_dependencies([enough_data]):
        sample_shape = tf.reshape(self._num_clusters, [-1])
        indices = tf.random_uniform(
            sample_shape,
            minval=0,
            maxval=tf.cast(num_data, tf.int64),
            seed=self._random_seed,
            dtype=tf.int64)
        return embedding_lookup(
            self._inputs, indices, partition_strategy='div')
def _init_clusters_random(data, num_clusters, random_seed):
    """Does random initialization of clusters.

    Args:
      data: a list of Tensors with a matrix of data, each row is an example.
      num_clusters: an integer with the number of clusters.
      random_seed: Seed for PRNG used to initialize seeds.

    Returns:
      A Tensor with num_clusters random rows of data.
    """
    assert isinstance(data, list)
    rows_per_input = [tf.shape(inp)[0] for inp in data]
    num_data = tf.add_n(rows_per_input)
    enough_data = tf.assert_less_equal(num_clusters, num_data)
    with tf.control_dependencies([enough_data]):
        sampled = tf.random_uniform(
            [num_clusters],
            minval=0,
            maxval=tf.cast(num_data, tf.int64),
            seed=random_seed,
            dtype=tf.int64)
    # Indices are narrowed to int32 and wrapped into the valid row range.
    indices = tf.cast(sampled, tf.int32) % num_data
    return embedding_lookup(data, indices, partition_strategy='div')
def _make_runtime_assertions(
        self, distribution, reinterpreted_batch_ndims, validate_args):
    """Check `reinterpreted_batch_ndims` against the distribution's batch rank.

    Args:
      distribution: Distribution whose batch shape is inspected.
      reinterpreted_batch_ndims: Number of batch dimensions to reinterpret.
      validate_args: Python bool. When true and the comparison cannot be made
        statically, a runtime assertion op is returned.

    Returns:
      A (possibly empty) list of assertion ops.

    Raises:
      ValueError: if both values are statically known and
        `reinterpreted_batch_ndims` exceeds the batch rank.
    """
    static_reinterpreted_batch_ndims = tf.contrib.util.constant_value(
        reinterpreted_batch_ndims)
    batch_ndims = distribution.batch_shape.ndims

    both_static = (batch_ndims is not None
                   and static_reinterpreted_batch_ndims is not None)
    if both_static:
        if static_reinterpreted_batch_ndims > batch_ndims:
            raise ValueError("reinterpreted_batch_ndims({}) cannot exceed "
                             "distribution.batch_ndims({})".format(
                                 static_reinterpreted_batch_ndims,
                                 batch_ndims))
        return []

    if not validate_args:
        return []

    # Fall back to the batch shape tensor: use its static length when known,
    # otherwise its dynamic length.
    batch_shape = distribution.batch_shape_tensor()
    if batch_shape.shape.with_rank_at_least(1)[0].value is not None:
        batch_ndims = batch_shape.shape[0].value
    else:
        batch_ndims = tf.shape(batch_shape)[0]
    return [
        tf.assert_less_equal(
            reinterpreted_batch_ndims,
            batch_ndims,
            message=("reinterpreted_batch_ndims cannot exceed "
                     "distribution.batch_ndims")),
    ]
def replace(self, episodes, length, rows=None):
    """Overwrite complete episodes held in the buffers.

    Args:
      episodes: Tuple of transition quantities with batch and time dimensions.
      length: Batch of sequence lengths.
      rows: Episodes to replace, defaults to all.

    Returns:
      Operation.
    """
    if rows is None:
        rows = tf.range(self._capacity)
    assert rows.shape.ndims == 1

    capacity_check = tf.assert_less(
        rows, self._capacity, message='capacity exceeded')
    with tf.control_dependencies([capacity_check]):
        length_check = tf.assert_less_equal(
            length, self._max_length, message='max length exceeded')

    def _update_rows(var, val):
        """Writes `val` into the selected rows of `var`."""
        return tf.scatter_update(var, rows, val)

    with tf.control_dependencies([length_check]):
        buffer_updates = tools.nested.map(
            _update_rows, self._buffers, episodes, flatten=True)

    # The stored lengths are only written once all buffers were updated.
    with tf.control_dependencies(buffer_updates):
        return tf.scatter_update(self._length, rows, length)
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    Optionally adds token-type and position embeddings to the input, then
    applies layer normalization followed by dropout.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
        Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. The name of the embedding table
        variable for token type ids.
      use_position_embeddings: bool. Whether to add position embeddings for the
        position of each token in the sequence.
      position_embedding_name: string. The name of the embedding table variable
        for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.
      dropout_prob: float. Dropout probability applied to the final output
        tensor.

    Returns:
      float tensor with same shape as `input_tensor`.

    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is
        # always faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(
            flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(
            token_type_embeddings, [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we
            # create it using a (long) sequence length
            # `max_position_embeddings`. The actual sequence length might be
            # shorter than this, for faster training of tasks that do not have
            # long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the
            # current sequence has positions [0, 1, 2, ... seq_length-1], so we
            # can just perform a slice.
            position_embeddings = tf.slice(
                full_position_embeddings, [0, 0], [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and
            # `width`), so we broadcast among the first dimensions, which is
            # typically just the batch size.
            position_broadcast_shape = (
                [1] * (num_dims - 2) + [seq_length, width])
            position_embeddings = tf.reshape(
                position_embeddings, position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def __init__(self,
             df,
             scale_operator,
             input_output_cholesky=False,
             validate_args=False,
             allow_nan_stats=True,
             name=None):
    """Construct Wishart distributions.

    Args:
      df: `float` or `double` tensor, the degrees of freedom of the
        distribution(s). `df` must be greater than or equal to `k`.
      scale_operator: `float` or `double` instance of `LinearOperator`.
      input_output_cholesky: Python `bool`. If `True`, functions whose input or
        output have the semantics of samples assume inputs are in Cholesky form
        and return outputs in Cholesky form. In particular, if this flag is
        `True`, input to `log_prob` is presumed of Cholesky form and output
        from `sample`, `mean`, and `mode` are of Cholesky form. Setting this
        argument to `True` is purely a computational optimization and does not
        change the underlying distribution; for instance, `mean` returns the
        Cholesky of the mean, not the mean of Cholesky factors. The `variance`
        and `stddev` methods are unaffected by this flag.
        Default value: `False` (i.e., input/output does not have Cholesky
        semantics).
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
        result is undefined. When `False`, an exception is raised if one or
        more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      TypeError: if scale is not floating-type
      TypeError: if scale.dtype != df.dtype
      ValueError: if scale_operator is not square
      ValueError: if df < k, where scale operator event shape is
        `(k, k)`
    """
    # Must stay the first statement so that only the constructor arguments
    # are captured.
    parameters = dict(locals())
    self._input_output_cholesky = input_output_cholesky
    with tf.name_scope(name) as name:
        with tf.name_scope("init", values=[df, scale_operator]):
            if not scale_operator.dtype.is_floating:
                raise TypeError(
                    "scale_operator.dtype=%s is not a floating-point type" %
                    scale_operator.dtype)
            if not scale_operator.is_square:
                raise ValueError("scale_operator must be square.")

            self._scale_operator = scale_operator
            self._df = tf.convert_to_tensor(
                df, dtype=scale_operator.dtype, name="df")
            contrib_tensor_util.assert_same_float_dtype(
                (self._df, self._scale_operator))
            # Use the static dimension when it is known, otherwise fall back
            # to the operator's dynamic dimension tensor.
            if (self._scale_operator.shape.ndims is None or
                    self._scale_operator.shape[-1].value is None):
                self._dimension = tf.cast(
                    self._scale_operator.domain_dimension_tensor(),
                    dtype=self._scale_operator.dtype,
                    name="dimension")
            else:
                self._dimension = tf.convert_to_tensor(
                    self._scale_operator.shape[-1].value,
                    dtype=self._scale_operator.dtype,
                    name="dimension")

            # Check `df >= dimension` statically when both values are known;
            # otherwise defer to a runtime assertion if validation is on.
            df_val = tensor_util.constant_value(self._df)
            dim_val = tensor_util.constant_value(self._dimension)
            if df_val is not None and dim_val is not None:
                df_val = np.asarray(df_val)
                if np.any(df_val < dim_val):
                    raise ValueError(
                        "Degrees of freedom (df = %s) cannot be less than "
                        "dimension of scale matrix (scale.dimension = %s)"
                        % (df_val, dim_val))
            elif validate_args:
                assertions = tf.assert_less_equal(
                    self._dimension,
                    self._df,
                    message=("Degrees of freedom (df = %s) cannot be "
                             "less than dimension of scale matrix "
                             "(scale.dimension = %s)" %
                             (self._df, self._dimension)))
                self._df = control_flow_ops.with_dependencies(
                    [assertions], self._df)
    super(_WishartLinearOperator, self).__init__(
        dtype=self._scale_operator.dtype,
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats,
        reparameterization_type=tf.distributions.FULLY_REPARAMETERIZED,
        parameters=parameters,
        graph_parents=(
            [self._df, self._dimension] +
            self._scale_operator.graph_parents),
        name=name)
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Post-processes a word embedding tensor.

    Optionally adds token-type and position embeddings to the input, then
    applies layer normalization followed by dropout.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size] holding the word embeddings.
      use_token_type: bool. Whether to add token type embeddings.
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
        Required when `use_token_type` is True.
      token_type_vocab_size: int. Number of token types.
      token_type_embedding_name: string. Name of the token type embedding
        table variable.
      use_position_embeddings: bool. Whether to add position embeddings.
      position_embedding_name: string. Name of the position embedding table
        variable.
      initializer_range: float. Range parameter of the weight initialization.
      max_position_embeddings: int. Maximum number of positions; it may be
        longer than the input sequence but not shorter.
      dropout_prob: float. Dropout probability applied to the final output.

    Returns:
      float tensor with the same shape as `input_tensor`.

    Raises:
      ValueError: A tensor shape or an input value is invalid.
    """
    # Dimensions of the input: batch_size, seq_length and width (the
    # embedding size).
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        # Token type embedding table.
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # The token type vocabulary is always small, so the lookup is done
        # with a one-hot matmul, which is faster for small vocabularies.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(
            flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(
            token_type_embeddings, [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            # Position embedding table.
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # The table covers positions 0 .. max_position_embeddings-1; the
            # embeddings for positions 0 .. seq_length-1 are obtained with a
            # plain slice.
            position_embeddings = tf.slice(
                full_position_embeddings, [0, 0], [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions (`seq_length` and `width`) are
            # relevant, so the leading dimensions (typically just the batch
            # size) are broadcast: for a rank-3 input the target shape is
            # [1, seq_length, width].
            position_broadcast_shape = (
                [1] * (num_dims - 2) + [seq_length, width])
            position_embeddings = tf.reshape(
                position_embeddings, position_broadcast_shape)
            output += position_embeddings

    # Layer normalization first, then dropout.
    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def transformer_model(input_tensor,
                      is_training,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=1,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.2,
                      attention_probs_dropout_prob=0.2,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      use_position_embeddings=True,
                      max_position_embeddings=512):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".

    This is almost an exact implementation of the original Transformer encoder.

    See the original paper:
    https://arxiv.org/abs/1706.03762

    Also see:
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
      is_training: bool. When false, both dropout probabilities are forced to
        0.0.
      attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
        seq_length], with 1 for positions that can be attended to and 0 in
        positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_attention_heads: int. Number of attention heads in the Transformer.
      intermediate_size: int. The size of the "intermediate" (a.k.a., feed
        forward) layer.
      intermediate_act_fn: function. The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: float. Dropout probability for the hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the attention
        probabilities.
      initializer_range: float. Range of the initializer (stddev of truncated
        normal).
      do_return_all_layers: Whether to return the outputs of all layers instead
        of only the first-token slice of the final layer.
      use_position_embeddings: bool. Whether to add learned position embeddings
        to `input_tensor` before the first layer.
      max_position_embeddings: int. Number of rows of the position embedding
        table; `seq_length` must not exceed it.

    Returns:
      If `do_return_all_layers` is true, a list with the output of every layer,
      each reshaped back to the shape of `input_tensor`. Otherwise only the
      slice of the final layer at sequence position 0, with the sequence axis
      squeezed out (presumably shape [batch_size, hidden_size] - confirm
      against `reshape_from_matrix`).

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    # Dropout is disabled outside of training.
    if not is_training:
        hidden_dropout_prob = 0.0
        attention_probs_dropout_prob = 0.0

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError(
            "The width of the input tensor (%d) != hidden size (%d)" %
            (input_width, hidden_size))

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name='position_embeddings',
                shape=[max_position_embeddings, input_width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we
            # create it using a (long) sequence length
            # `max_position_embeddings`. The actual sequence length might be
            # shorter than this, for faster training of tasks that do not have
            # long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the
            # current sequence has positions [0, 1, 2, ... seq_length-1], so we
            # can just perform a slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(input_tensor.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and
            # `width`), so we broadcast among the first dimensions, which is
            # typically just the batch size.
            position_broadcast_shape = []
            for _ in range(num_dims - 2):
                position_broadcast_shape.append(1)
            position_broadcast_shape.extend([seq_length, input_width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            input_tensor += position_embeddings

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them
    # to help the optimizer.
    prev_output = reshape_to_matrix(input_tensor)

    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output

            with tf.variable_scope("attention"):
                attention_heads = []
                with tf.variable_scope("self"):
                    # Self-attention: the layer input attends to itself.
                    attention_head = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=
                        attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attention_heads.append(attention_head)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    # In the case where we have other sequences, we just
                    # concatenate them to the self-attention head before the
                    # projection.
                    attention_output = tf.concat(attention_heads, axis=-1)

                # Run a linear projection of `hidden_size` then add a residual
                # with `layer_input`.
                with tf.variable_scope("output"):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=create_initializer(
                            initializer_range))
                    attention_output = dropout(attention_output,
                                               hidden_dropout_prob)
                    attention_output = layer_norm(attention_output +
                                                  layer_input)

            # The activation is only applied to the "intermediate" hidden
            # layer.
            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range))

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope("output"):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range))
                layer_output = dropout(layer_output, hidden_dropout_prob)
                layer_output = layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        # Keep only sequence position 0 and drop the sequence axis.
        first_token_tensor = tf.squeeze(final_output[:, 0:1, :], axis=1)
        return first_token_tensor
def pgd_generate(x, model, eps=0.3, eps_iter=0.05, nb_iter=10, y=None,
                 ord=np.inf, clip_min=None, clip_max=None, y_target=None,
                 rand_init=True, rand_init_eps=0.3, clip_grad=False,
                 sanity_checks=True):
    """
    Generate the symbolic graph of a projected gradient descent attack.

    Starting from `x` plus an optional random perturbation, `nb_iter`
    `fgsm_generate` steps are unrolled in a Python loop. After every step the
    accumulated perturbation is projected with `clip_eta(eta, ord, eps)` and,
    when a data range is given, the result is clipped back into that range.

    :param x: The model's symbolic inputs.
    :param model: Callable applied to `x` to obtain predictions when neither
                  `y` nor `y_target` is given; also forwarded to
                  `fgsm_generate`.
    :param eps: Bound passed to `clip_eta` for the total perturbation.
    :param eps_iter: Step size passed to `fgsm_generate` as `eps`.
    :param nb_iter: Python int, number of unrolled attack iterations.
    :param y: (optional) Labels for an untargeted attack.
    :param ord: Order of the norm, forwarded to `random_lp_vector`,
                `clip_eta` and `fgsm_generate`.
    :param clip_min: (optional) Minimum input component value.
    :param clip_max: (optional) Maximum input component value.
    :param y_target: (optional) Labels to target; takes precedence over `y`.
    :param rand_init: Whether to start from a random perturbation drawn with
                      `random_lp_vector` instead of zeros.
    :param rand_init_eps: Size argument passed to `random_lp_vector`.
    :param clip_grad: Forwarded to `fgsm_generate`.
    :param sanity_checks: If True, the returned tensor carries control
                          dependencies on the collected assertions.
    :return: A tensor holding the adversarial examples.
    """
    asserts = []

    # If a data range was specified, check that the input was in that range.
    if clip_min is not None:
        asserts.append(
            tf.assert_greater_equal(x, tf.cast(clip_min, x.dtype)))
    if clip_max is not None:
        asserts.append(
            tf.assert_less_equal(x, tf.cast(clip_max, x.dtype)))

    has_clip_range = clip_min is not None or clip_max is not None

    # Initialize the perturbation.
    if rand_init:
        eta = random_lp_vector(tf.shape(x), ord,
                               tf.cast(rand_init_eps, x.dtype),
                               dtype=x.dtype)
    else:
        # tf.zeros defaults to float32; use the dtype of `x` so that
        # `x + eta` also works for other floating point inputs.
        eta = tf.zeros(tf.shape(x), dtype=x.dtype)

    # Clip eta
    eta = clip_eta(eta, ord, eps)
    adv_x = x + eta
    if has_clip_range:
        adv_x = clip_by_value(adv_x, clip_min, clip_max)

    if y_target is not None:
        y = y_target
        targeted = True
    elif y is not None:
        targeted = False
    else:
        # No labels supplied: use the model's own arg-max predictions
        # (axis 1) as one-hot labels and block gradients through them.
        model_preds = model(x)
        preds_max = tf.reduce_max(model_preds, 1, keepdims=True)
        y = tf.to_float(tf.equal(model_preds, preds_max))
        y = tf.stop_gradient(y)
        targeted = False
        del model_preds

    for _ in range(nb_iter):
        adv_x = fgsm_generate(adv_x, model, y=y, eps=eps_iter, ord=ord,
                              clip_min=clip_min, clip_max=clip_max,
                              clip_grad=clip_grad, targeted=targeted,
                              sanity_checks=True)

        # Clipping perturbation eta to ord norm ball
        eta = adv_x - x
        eta = clip_eta(eta, ord, eps)
        adv_x = x + eta

        # Redo the clipping.
        # FGM already did it, but subtracting and re-adding eta can add some
        # small numerical error.
        if has_clip_range:
            adv_x = clip_by_value(adv_x, clip_min, clip_max)

    common_dtype = tf.float32
    asserts.append(
        tf.assert_less_equal(tf.cast(eps_iter, dtype=common_dtype),
                             tf.cast(eps, dtype=common_dtype)))
    # The range check needs both bounds; with only `clip_min` set there is
    # no `clip_max` to cast.
    if ord == np.inf and clip_min is not None and clip_max is not None:
        # The 1e-6 slack compensates for floating point error.
        asserts.append(
            tf.assert_less_equal(
                tf.cast(eps, x.dtype),
                1e-6 + tf.cast(clip_max, x.dtype)
                - tf.cast(clip_min, x.dtype)))

    if sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x
def model_fn(features, labels, mode, params):
    """Assemble the flat text autoencoder graph and pass it to `ctc_estimator`.

    The token ids are validated, encoded into `mu` / `logsigma`, sampled and
    decoded into `logits`. Unless the model mode is `ModelModes.AE`, the
    decoder is applied a second time, reusing its variables, to the prior
    sample to produce `glogits`. For the `AAE_RE` and `AAE_STOCH` modes a
    discriminator over the prior and encoded latents is built together with
    its GAN losses and a training hook.

    `vocab_size`, `vocab` and `run_config` are not parameters of this
    function; they are read from the enclosing scope.

    Args:
        features: Mapping holding the token ids under `TEXT` (N, L) and the
            token lengths under `SENTENCE_LENGTH` (N,).
        labels: Unused.
        mode: A `tf.estimator.ModeKeys` value.
        params: Hyper-parameters; `flat_length`, `l2` and `model_mode` are
            read here and the object is forwarded to the helper functions.

    Returns:
        The value returned by `ctc_estimator`.

    Raises:
        ValueError: If `params.model_mode` is none of the handled modes.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Inputs
    tokens = features[TEXT]  # (N, L)
    token_lengths = features[SENTENCE_LENGTH]  # (N,)
    sequence_mask = tf.sequence_mask(
        maxlen=tf.shape(tokens)[1], lengths=token_lengths)
    n = tf.shape(tokens)[0]
    length = params.flat_length

    # Runtime checks on the inputs; they are attached to `tokens` below.
    input_checks = [
        tf.assert_greater_equal(
            length, token_lengths,
            message="Tokens longer than flat_length"),
        tf.assert_less_equal(
            tokens, tf.cast(vocab_size - 1, dtype=tokens.dtype),
            message="Tokens larger than vocab"),
        tf.assert_greater_equal(
            tokens, tf.cast(0, dtype=tokens.dtype),
            message="Tokens less than 0"),
    ]
    with tf.control_dependencies(input_checks):
        tokens = tf.identity(tokens)

    weights_regularizer = (
        slim.l2_regularizer(params.l2) if params.l2 > 0 else None)
    model_mode = params.model_mode

    with tf.variable_scope('autoencoder') as autoencoder_scope:
        # Encoder
        with tf.variable_scope('encoder'):
            mu, logsigma = encoder_flat(
                tokens=tokens,
                token_lengths=token_lengths,
                vocab_size=vocab_size,
                params=params,
                n=n,
                weights_regularizer=weights_regularizer)

        # Sampling
        latent_sample, latent_prior_sample = sampling_flat(
            mu=mu, logsigma=logsigma, params=params, n=n)

        # Decoder
        with tf.variable_scope('decoder', reuse=False) as decoder_scope:
            logits = decoder_flat(
                latent=latent_sample,
                vocab_size=vocab_size,
                params=params,
                weights_regularizer=weights_regularizer,
                n=n)

        glogits = None
        if model_mode != ModelModes.AE:
            # Decode the prior sample with the same decoder variables.
            with tf.variable_scope(decoder_scope, reuse=True):
                glogits = decoder_flat(
                    latent=latent_prior_sample,
                    vocab_size=vocab_size,
                    params=params,
                    weights_regularizer=weights_regularizer,
                    n=n)

    if model_mode in (ModelModes.AAE_RE, ModelModes.AAE_STOCH):
        with tf.variable_scope('discriminator') as discriminator_scope:
            # Prior samples first, encoded samples second, stacked on axis 0.
            dis_inputs = tf.concat(
                [latent_prior_sample, latent_sample], axis=0)
            dis_out = discriminator_output(
                x=dis_inputs,
                params=params,
                weights_regularizer=weights_regularizer,
                is_training=is_training)
            dis_out = tf.squeeze(dis_out, -1)
            print("Dis: {} -> {}".format(dis_inputs, dis_out))
        build_gan_losses(
            params=params,
            autoencoder_scope=autoencoder_scope.name,
            discriminator_scope=discriminator_scope.name,
            dis_out=dis_out,
            n=n)
        discriminator_hook = dis_train_hook(
            discriminator_scope=discriminator_scope.name, params=params)
        training_hooks = [discriminator_hook]
    elif model_mode in (ModelModes.VAE, ModelModes.AE):
        training_hooks = []
    else:
        raise ValueError()

    # Every example uses the same flat length for CTC.
    sequence_length_ctc = tf.tile([length], (n, ))
    return ctc_estimator(
        tokens=tokens,
        token_lengths=token_lengths,
        logits=logits,
        glogits=glogits,
        sequence_mask=sequence_mask,
        sequence_length_ctc=sequence_length_ctc,
        vocab=vocab,
        run_config=run_config,
        params=params,
        model_scope=autoencoder_scope.name,
        training_hooks=training_hooks,
        mode=mode)
def one_hots(offsets, name='one_hots'):
    """One-hot encode `scale - offsets` and insert a singleton axis at 1.

    `scale`, `kernel_size`, `batch_size` and `assert_shape` are not defined
    here; they are read from the enclosing scope.

    Args:
        offsets: Tensor of offsets; a runtime assertion requires
            `abs(offsets) <= scale`.
        name: Name scope for the created ops.

    Returns:
        The one-hot tensor (depth `kernel_size`) with an extra axis at
        position 1. It is passed to `assert_shape` with the shape
        `[batch_size, 1, kernel_size]`.
    """
    with tf.name_scope(name) as scope:
        within_scale = tf.assert_less_equal(tf.abs(offsets), scale)
        with tf.control_dependencies([within_scale]):
            # Shift so that an offset of `scale` maps to index 0.
            indices = scale - offsets
            encoded = tf.one_hot(indices, kernel_size)
            result = tf.expand_dims(encoded, 1, name=scope)
        assert_shape(result, [batch_size, 1, kernel_size])
        return result
def bert_encoder(sequence, params):
    """Embed `sequence` BERT-style and run the stacked transformer encoder.

    Args:
        sequence: Tensor of token ids. It is compared against
            `params.bert.vocab.pad` / `params.bert.vocab.sep` and used for the
            embedding lookup.
            NOTE(review): presumably [batch_size, seq_length] -- confirm.
        params: Hyper-parameter object; reads `params.bert.*`,
            `params.tune_bert` and `params.use_bert_single`.

    Returns:
        A list whose first element is the list of per-layer outputs returned
        by `encoder.transformer_model`. When `params.use_bert_single` is
        truthy, the pooled first-token output is appended as second element.
    """
    bert_cfg = params.bert

    # Sequence mask: 1.0 for real tokens, 0.0 where the token is the pad id.
    seq_mask = 1. - tf.to_float(tf.equal(sequence, bert_cfg.vocab.pad))

    # Segment id of a position = number of separator tokens strictly before
    # it; padded positions are forced to 0 by the mask.
    sep_hits = tf.to_float(tf.equal(sequence, bert_cfg.vocab.sep))
    seps_at_or_after = tf.cumsum(sep_hits, axis=1, reverse=True)
    sep_total = tf.reduce_sum(sep_hits, axis=1, keepdims=True)
    seg_ids = tf.to_int32((sep_total - seps_at_or_after) * seq_mask)

    # Sequence length information
    _, seq_length = util.shape_list(sequence)[:2]

    def trainable_getter(getter, name, *args, **kwargs):
        # Every variable under the "bert" scope follows `params.tune_bert`.
        kwargs['trainable'] = params.tune_bert
        return getter(name, *args, **kwargs)

    with tf.variable_scope("bert", custom_getter=trainable_getter):
        hidden_size = bert_cfg.hidden_size
        embed_initializer = tf.truncated_normal_initializer(
            stddev=bert_cfg.initializer_range)

        # Input embedding = token embedding plus segment embedding plus
        # positional embedding.
        with tf.variable_scope("embeddings"):
            word_table = tf.get_variable(
                name="word_embeddings",
                shape=[bert_cfg.vocab.size, hidden_size],
                initializer=embed_initializer)
            token_embed = tf.nn.embedding_lookup(word_table, sequence)

            segment_table = tf.get_variable(
                name="token_type_embeddings",
                shape=[2, hidden_size],
                initializer=embed_initializer)
            segment_embed = tf.nn.embedding_lookup(segment_table, seg_ids)

            # Word embedding + segment embedding
            seq_embed = token_embed + segment_embed

            # Add the position embedding; the sequence must fit the table.
            length_check = tf.assert_less_equal(
                seq_length, bert_cfg.max_position_embeddings)
            with tf.control_dependencies([length_check]):
                position_table = tf.get_variable(
                    name="position_embeddings",
                    shape=[bert_cfg.max_position_embeddings, hidden_size],
                    initializer=embed_initializer)
                pos_embed = position_table[:seq_length]
                seq_embed = seq_embed + tf.expand_dims(pos_embed, 0)

            # Post-processing: layer norm followed by dropout.
            seq_embed = tc.layers.layer_norm(
                inputs=seq_embed, begin_norm_axis=-1, begin_params_axis=-1)
            seq_embed = util.valid_apply_dropout(
                seq_embed, bert_cfg.hidden_dropout_prob)

        # Sequence encoding with the transformer encoder.
        with tf.variable_scope("encoder"):
            attention_mask = encoder.create_attention_mask_from_input_mask(
                sequence, seq_mask)

            # Run the stacked transformer and keep every layer's output.
            all_encoder_layers = encoder.transformer_model(
                input_tensor=seq_embed,
                attention_mask=attention_mask,
                hidden_size=hidden_size,
                num_hidden_layers=bert_cfg.num_hidden_layers,
                num_attention_heads=bert_cfg.num_attention_heads,
                intermediate_size=bert_cfg.intermediate_size,
                intermediate_act_fn=encoder.get_activation(
                    bert_cfg.hidden_act),
                hidden_dropout_prob=bert_cfg.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    bert_cfg.attention_probs_dropout_prob),
                initializer_range=bert_cfg.initializer_range,
                do_return_all_layers=True)

        bert_outputs = [all_encoder_layers]

        if params.use_bert_single:
            # The "pooler" takes the last layer's hidden state at the first
            # token and passes it through a tanh dense layer, giving one
            # fixed-size vector per sequence.
            with tf.variable_scope("pooler"):
                last_layer = all_encoder_layers[-1]
                first_token_tensor = tf.squeeze(
                    last_layer[:, 0:1, :], axis=1)
                pooled_output = tf.layers.dense(
                    first_token_tensor,
                    hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=embed_initializer)
            bert_outputs.append(pooled_output)

    return bert_outputs
def __init__(self,
             mean_direction,
             concentration,
             validate_args=False,
             allow_nan_stats=True,
             name='VonMisesFisher'):
    """Creates a new `VonMisesFisher` instance.

    Args:
      mean_direction: Floating-point `Tensor` with shape [B1, ... Bn, D].
        A unit vector indicating the mode of the distribution, or the
        unit-normalized direction of the mean. (This is *not* in general the
        mean of the distribution; the mean is not generally in the support of
        the distribution.) NOTE: `D` is currently restricted to <= 5.
      concentration: Floating-point `Tensor` having batch shape [B1, ... Bn]
        broadcastable with `mean_direction`. The level of concentration of
        samples around the `mean_direction`. `concentration=0` indicates a
        uniform distribution over the unit hypersphere, and
        `concentration=+inf` indicates a `Deterministic` distribution (delta
        function) at `mean_direction`.
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
        result is undefined. When `False`, an exception is raised if one or
        more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      ValueError: For known-bad arguments, i.e. unsupported event dimension.
    """
    # Must stay the first statement so that only the constructor arguments
    # are captured.
    parameters = dict(locals())
    with tf.name_scope(name, values=[mean_direction, concentration]) as name:
        # Convert up front so that the static shape inspection below also
        # works for non-Tensor inputs (lists, NumPy arrays).
        mean_direction = tf.convert_to_tensor(
            mean_direction, name='mean_direction')
        concentration = tf.convert_to_tensor(
            concentration, name='concentration')

        assertions = [
            tf.assert_non_negative(
                concentration,
                message='`concentration` must be non-negative'),
            tf.assert_greater(
                tf.shape(mean_direction)[-1], 1,
                message='`mean_direction` may not have scalar event shape'),
            tf.assert_near(
                1., tf.linalg.norm(mean_direction, axis=-1),
                message='`mean_direction` must be unit-length'),
        ] if validate_args else []

        # Statically known event size, or None when only known at run time.
        static_event_dim = (
            mean_direction.shape.with_rank_at_least(1)[-1].value)
        if static_event_dim is not None:
            if static_event_dim > 5:
                raise ValueError('vMF ndims > 5 is not currently supported')
        elif validate_args:
            assertions += [
                tf.assert_less_equal(
                    tf.shape(mean_direction)[-1], 5,
                    message='vMF ndims > 5 is not currently supported')
            ]

        with tf.control_dependencies(assertions):
            # Control dependencies only attach to ops created inside this
            # context. `tf.convert_to_tensor` hands back an existing Tensor
            # unchanged, so `tf.identity` is used to make sure the stored
            # parameters really depend on the assertions.
            self._mean_direction = tf.identity(
                mean_direction, name='mean_direction')
            self._concentration = tf.identity(
                concentration, name='concentration')
        tf.assert_same_float_dtype(
            [self._mean_direction, self._concentration])

        # mean_direction is always reparameterized.
        # concentration is only for event_dim==3, via an inversion sampler.
        reparameterization_type = (
            tf.distributions.FULLY_REPARAMETERIZED
            if static_event_dim == 3
            else tf.distributions.NOT_REPARAMETERIZED)
    super(VonMisesFisher, self).__init__(
        dtype=self._concentration.dtype,
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats,
        reparameterization_type=reparameterization_type,
        parameters=parameters,
        graph_parents=[self._mean_direction, self._concentration],
        name=name)
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Adds token-type and position embeddings, then layer norm and dropout.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int32 Tensor of shape [batch_size,
        seq_length]. Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. The name of the embedding table
        variable for token type ids.
      use_position_embeddings: bool. Whether to add position embeddings for
        the position of each token in the sequence.
      position_embedding_name: string. The name of the embedding table
        variable for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.
      dropout_prob: float. Dropout probability applied to the final output
        tensor.

    Returns:
      float tensor with same shape as `input_tensor`.

    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    # The input must be rank 3: [batch_size, seq_length, embedding_size].
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size, seq_length, width = (
        input_shape[0], input_shape[1], input_shape[2])

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if"
                             "`use_token_type` is True.")
        # Token-type embedding table of shape [token_type_vocab_size, width].
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is
        # always faster for a small vocabulary: the ids are one-hot encoded
        # and multiplied with the table instead of using a gather.
        one_hot_ids = tf.one_hot(
            tf.reshape(token_type_ids, [-1]), depth=token_type_vocab_size)
        token_type_embeddings = tf.reshape(
            tf.matmul(one_hot_ids, token_type_table),
            [batch_size, seq_length, width])
        # Add the token-type embeddings onto the incoming embeddings.
        output += token_type_embeddings

    if use_position_embeddings:
        # Make sure the sequence is not longer than the position table.
        length_check = tf.assert_less_equal(seq_length,
                                            max_position_embeddings)
        with tf.control_dependencies([length_check]):
            # Position embedding table, [max_position_embeddings, width].
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we
            # create it using a (long) sequence length
            # `max_position_embeddings`. The actual sequence length might be
            # shorter than this, for faster training of tasks that do not
            # have long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the
            # current sequence has positions [0, 1, 2, ... seq_length-1], so
            # we can just perform a slice down to [seq_length, width].
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])

            # Only the last two dimensions are relevant (`seq_length` and
            # `width`), so we broadcast among the first dimensions, which is
            # typically just the batch size: every leading dimension becomes
            # 1, e.g. [1, seq_length, width].
            num_leading_dims = len(output.shape.as_list()) - 2
            position_broadcast_shape = (
                [1] * num_leading_dims + [seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            # The same position embeddings are added to every batch element.
            output += position_embeddings

    # Apply layer norm and dropout.
    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def pix2pix_preprocess(images,
                       num_preprocessing_layers=0,
                       num_outputs=3,
                       encoder_base_num_filters=32,
                       is_training=True,
                       reuse=False,
                       is_chief=True,
                       num_rows=5,
                       batch_size=32,
                       verbose=True,
                       **kwargs):
    """Free-form transformation preprocessing.

    Args:
        images: Tensor of images. When preprocessing is enabled, runtime
            assertions require its values to lie in [0, 1].
        num_preprocessing_layers: Non-positive int. 0 returns `images`
            unchanged; a negative value -k builds a pix2pix network with
            k encoder blocks.
        num_outputs: Number of output channels for Pix2Pix
        encoder_base_num_filters: Base number of filters in the pix2pix
            encoder; block i uses `encoder_base_num_filters * 2**i` filters.
        is_training: whether the model is in training mode
        reuse: whether to reuse the model.
        is_chief: determine whether to add summaries
        num_rows: required for summary (if is_chief is True)
        batch_size: required for summary (if is_chief is True)
        verbose: verbosity level
        kwargs: Extra keyword arguments, forwarded to `net_utils.pix2pix`.

    Returns:
        `images` itself when `num_preprocessing_layers` is 0, otherwise the
        pix2pix output mapped from [-1, 1] to [0, 1] via `x / 2 + 0.5`.
    """
    assert num_preprocessing_layers <= 0

    # No preprocessing
    if num_preprocessing_layers == 0:
        return images

    # Pix2Pix takes images in [-1, 1]: check the [0, 1] input range, then
    # rescale.
    lower_bound_ok = tf.assert_greater_equal(images, 0.)
    with tf.control_dependencies([lower_bound_ok]):
        upper_bound_ok = tf.assert_less_equal(images, 1.)
        with tf.control_dependencies([upper_bound_ok]):
            images = (images - 0.5) * 2

    if is_chief:
        input_images = viz_utils.image_grid(
            images, num_rows=num_rows, batch_size=batch_size)

    # Pix2pix. Output in [-1, 1]
    num_blocks = -num_preprocessing_layers
    encoder_blocks = [
        encoder_base_num_filters * (2**i) for i in range(num_blocks)
    ]
    images = net_utils.pix2pix(
        images,
        encoder_blocks=encoder_blocks,
        num_outputs=num_outputs,
        is_training=is_training,
        reuse=reuse,
        is_chief=is_chief,
        verbose=verbose,
        **kwargs)

    # Image summaries
    if is_chief:
        images = tf.identity(images, name='projected_images')
        mode = 'train' if is_training else 'test'
        output_images = viz_utils.image_grid(
            images, num_rows=num_rows, batch_size=batch_size)

        # Tile the single-channel side so both grids have 3 channels.
        input_channels = input_images.get_shape()[-1]
        if num_outputs == 1 and input_channels == 3:
            output_images = tf.tile(output_images, (1, 1, 1, 3))
        if input_channels == 1 and num_outputs == 3:
            input_images = tf.tile(input_images, (1, 1, 1, 3))

        # Input and output grids side by side
        stacked = tf.concat([input_images, output_images], axis=0)
        summary_images = viz_utils.image_grid(
            stacked, num_rows=1, num_cols=2, batch_size=2)
        summary_images = tf.identity(summary_images, name='in_out_images')
        tf.summary.image(
            '%s/in_out' % mode, summary_images, collections=[mode])

    # Send output back to [0, 1]
    images = images / 2. + 0.5
    return images
def __init__(self,
             learning_rate,
             preconditioner_decay_rate=0.95,
             num_pseudo_batches=1,
             burnin=25,
             diagonal_bias=1e-8,
             name=None,
             variable_scope=None):
    """Set up the optimizer's hyper-parameter tensors and step counter.

    Each hyper-parameter is converted to a tensor; all except
    `learning_rate` are then wrapped so that reading them triggers a
    validity assertion.

    Args:
        learning_rate: Converted to a tensor and stored without checks.
        preconditioner_decay_rate: Asserted to lie in [0, 1].
        num_pseudo_batches: Asserted to be greater than zero.
        burnin: Asserted to be a non-negative integer.
        diagonal_bias: Asserted to be non-negative.
        name: Optional name for the name scope and the optimizer; defaults
            to 'StochasticGradientLangevinDynamics'.
        variable_scope: Optional variable scope in which the `counter`
            variable is created. When None, a fresh scope with a
            graph-unique name is opened.
    """
    default_name = 'StochasticGradientLangevinDynamics'
    scope_values = [
        learning_rate, preconditioner_decay_rate, num_pseudo_batches,
        burnin, diagonal_bias
    ]
    with tf.name_scope(name, default_name, scope_values):
        if variable_scope is not None:
            self._variable_scope = variable_scope
        else:
            unique_scope_name = tf.get_default_graph().unique_name(
                name or default_name)
            with tf.variable_scope(unique_scope_name) as scope:
                self._variable_scope = scope

        decay_rate = tf.convert_to_tensor(
            preconditioner_decay_rate, name='preconditioner_decay_rate')
        pseudo_batches = tf.convert_to_tensor(
            num_pseudo_batches, name='num_pseudo_batches')
        burnin_steps = tf.convert_to_tensor(burnin, name='burnin')
        bias = tf.convert_to_tensor(diagonal_bias, name='diagonal_bias')
        self._learning_rate = tf.convert_to_tensor(
            learning_rate, name='learning_rate')

        with tf.variable_scope(self._variable_scope):
            self._counter = tf.get_variable(
                'counter', initializer=0, trainable=False)

        # Attach the validity checks: each stored tensor depends on its
        # assertions.
        decay_rate_checks = [
            tf.assert_non_negative(
                decay_rate,
                message='`preconditioner_decay_rate` must be non-negative'),
            tf.assert_less_equal(
                decay_rate, 1.,
                message='`preconditioner_decay_rate` must be at most 1.'),
        ]
        self._preconditioner_decay_rate = (
            control_flow_ops.with_dependencies(decay_rate_checks,
                                               decay_rate))

        pseudo_batches_checks = [
            tf.assert_greater(
                pseudo_batches, 0,
                message='`num_pseudo_batches` must be greater than zero')
        ]
        self._num_pseudo_batches = control_flow_ops.with_dependencies(
            pseudo_batches_checks, pseudo_batches)

        burnin_checks = [
            tf.assert_non_negative(
                burnin_steps, message='`burnin` must be non-negative'),
            tf.assert_integer(
                burnin_steps, message='`burnin` must be an integer')
        ]
        self._burnin = control_flow_ops.with_dependencies(
            burnin_checks, burnin_steps)

        bias_checks = [
            tf.assert_non_negative(
                bias, message='`diagonal_bias` must be non-negative')
        ]
        self._diagonal_bias = control_flow_ops.with_dependencies(
            bias_checks, bias)

    super(StochasticGradientLangevinDynamics, self).__init__(
        use_locking=False, name=name or default_name)
def generate(self, x, **kwargs):
    """
    Generate symbolic graph for adversarial examples and return.

    :param x: The model's symbolic inputs.
    :param eps: (optional float) maximum distortion of adversarial example
                compared to original input
    :param eps_iter: (optional float) step size for each attack iteration
    :param nb_iter: (optional int) Number of attack iterations.
    :param rand_init: (optional) Whether to use random initialization
    :param y: (optional) A tensor with the true class labels
              NOTE: do not use smoothed labels here
    :param y_target: (optional) A tensor with the labels to target. Leave
                     y_target=None if y is also set. Labels should be
                     one-hot-encoded.
                     NOTE: do not use smoothed labels here
    :param ord: (optional) Order of the norm (mimics Numpy).
                Possible values: np.inf, 1 or 2.
    :param clip_min: (optional float) Minimum input component value
    :param clip_max: (optional float) Maximum input component value
    :return: A tensor holding the adversarial examples `x + eta`, clipped
             to [clip_min, clip_max] when both are set.
    """
    # Parse and save attack-specific parameters. The call is kept outside
    # the `assert` so that it still runs when Python strips assertions (-O).
    params_ok = self.parse_params(**kwargs)
    assert params_ok

    # Initialize loop variables
    if self.rand_init:
        eta = tf.random_uniform(tf.shape(x), -self.rand_minmax,
                                self.rand_minmax, dtype=self.tf_dtype)
    else:
        # Same dtype as the random branch; tf.zeros defaults to float32.
        eta = tf.zeros(tf.shape(x), dtype=self.tf_dtype)
    eta = clip_eta(eta, self.ord, self.eps)

    # Fix labels to the first model predictions for loss computation
    model_preds = self.model.get_output(x)
    preds_max = reduce_max(model_preds, 1, keepdims=True)
    if self.y_target is not None:
        y = self.y_target
        targeted = True
    elif self.y is not None:
        y = self.y
        targeted = False
    else:
        y = tf.to_float(tf.equal(model_preds, preds_max))
        y = tf.stop_gradient(y)
        targeted = False

    y_kwarg = 'y_target' if targeted else 'y'
    fgm_params = {
        'eps': self.eps_iter,
        y_kwarg: y,
        'ord': self.ord,
        'clip_min': self.clip_min,
        'clip_max': self.clip_max
    }

    # Use getattr() to avoid errors in eager execution attacks
    FGM = self.FGM_CLASS(self.model,
                         sess=getattr(self, 'sess', None),
                         dtypestr=self.dtypestr)

    def cond(i, _):
        """Iterate until the requested number of iterations is completed."""
        return tf.less(i, self.nb_iter)

    def body(i, e):
        """Take one FGM step from `x + e` and re-project the perturbation."""
        adv_x = FGM.generate(x + e, **fgm_params)

        # Clipping perturbation according to clip_min and clip_max
        if self.clip_min is not None and self.clip_max is not None:
            adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

        # Clipping perturbation eta to self.ord norm ball
        eta = adv_x - x
        eta = clip_eta(eta, self.ord, self.eps)
        return i + 1, eta

    _, eta = tf.while_loop(cond, body, [tf.zeros([]), eta], back_prop=True)

    # Define adversarial example (and clip if necessary)
    adv_x = x + eta
    if self.clip_min is not None or self.clip_max is not None:
        assert self.clip_min is not None and self.clip_max is not None
        adv_x = tf.clip_by_value(adv_x, self.clip_min, self.clip_max)

    asserts = []

    # Asserts run only on CPU.
    # When multi-GPU eval code tries to force all PGD ops onto GPU, this
    # can cause an error.
    with tf.device("/CPU:0"):
        asserts.append(tf.assert_less_equal(self.eps_iter, self.eps))
        if self.ord == np.inf and self.clip_min is not None:
            # The 1e-6 is needed to compensate for numerical error.
            # Without the 1e-6 this fails when e.g. eps=.2, clip_min=.5,
            # clip_max=.7
            asserts.append(
                tf.assert_less_equal(
                    self.eps, 1e-6 + self.clip_max - self.clip_min))

    if self.sanity_checks:
        with tf.control_dependencies(asserts):
            adv_x = tf.identity(adv_x)

    return adv_x
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=3,
                            token_type_embedding_name='token_type_embeddings',
                            use_positional_embeddings=True,
                            positional_embedding_type='normal',
                            pre_positional_embeddings=None,
                            positional_embedding_name='position_embeddings',
                            initializer_range=0.01,
                            max_positional_embeddings=512,
                            dropout_prob=0.01):
    """Performs some preprocessing on the word embeddings.

    Optionally adds segment (token type) embeddings and learned positional
    embeddings to `input_tensor`, then applies `layer_norm_and_dropout`.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add segment embeddings. For example
        `token_type_ids` of [[0, 0, 1], [0, 1, 0]] uses 0 for segment 1 and
        1 for segment 2; the trailing 0 in the second row marks padding.
      token_type_ids: (optional) int32 Tensor of shape [batch_size,
        seq_length]. Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The number of token types.
      token_type_embedding_name: string. The name of the token type embedding
        table variable.
      use_positional_embeddings: bool. Whether to add positional embeddings.
      positional_embedding_type: ['normal', 'trigonometrical']. Currently
        ignored; a learned table is always used.
      pre_positional_embeddings: Currently ignored.
      positional_embedding_name: string. The name of the positional embedding
        table variable.
      initializer_range: float. Range of the weight initializer.
      max_positional_embeddings: int. Maximum sequence length for each
        sentence, which should be equal to or longer than the sequence.
      dropout_prob: float. Dropout probability applied to the final output
        tensor.

    Returns:
      float Tensor with the identical shape as `input_tensor`.

    Raises:
      ValueError: If `use_token_type` is True but `token_type_ids` is None.
    """
    # All three dimensions are indexed below, so only rank-3 input is usable.
    # NOTE(review): `get_shape_list` is presumed to reject other ranks --
    # confirm against its definition.
    input_shape = get_shape_list(input_tensor, expected_rank=[3])
    seq_length, width = input_shape[1], input_shape[2]

    # Start from the input in case no additional embeddings are requested.
    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            message = ('`token_type_ids` must be specified if '
                       '`use_token_type` is True.')
            _error(message)
            raise ValueError(message)
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        token_type_embeddings = tf.nn.embedding_lookup(
            token_type_table, token_type_ids)
        output += token_type_embeddings

    if use_positional_embeddings:
        assert_op = tf.assert_less_equal(seq_length,
                                         max_positional_embeddings)
        with tf.control_dependencies([assert_op]):
            full_positional_embeddings = tf.get_variable(
                name=positional_embedding_name,
                shape=[max_positional_embeddings, width],
                initializer=create_initializer(initializer_range))
            # The table is created for the maximum sequence length; the
            # actual length may be shorter, so slice the first `seq_length`
            # rows. The [seq_length, width] slice broadcasts over the batch
            # dimension in the addition.
            positional_embeddings = tf.slice(
                full_positional_embeddings, [0, 0], [seq_length, -1])
            output += positional_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def embedding_postprocessor(
    input_tensor,
    use_token_type=False,
    token_type_ids=None,
    token_type_vocab_size=2,
    token_type_embedding_name='token_type_embeddings',
    use_position_embeddings=True,
    position_embedding_name='position_embeddings',
    initializer_range=0.02,
    max_position_embeddings=512,
):
    """Adds token-type and position embeddings to a word embedding tensor.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int32 Tensor of shape [batch_size,
        seq_length]. Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. The name of the embedding table
        variable for token type ids.
      use_position_embeddings: bool. Whether to add position embeddings for
        the position of each token in the sequence.
      position_embedding_name: string. The name of the embedding table
        variable for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.

    Returns:
      float tensor with same shape as `input_tensor`. No layer norm or
      dropout is applied here.

    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size, seq_length, width = (
        input_shape[0], input_shape[1], input_shape[2])

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError(
                '`token_type_ids` must be specified if'
                '`use_token_type` is True.'
            )
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range),
        )
        # Look the embeddings up with a one-hot matmul over the flattened
        # ids, then restore the [batch_size, seq_length, width] shape.
        one_hot_ids = tf.one_hot(
            tf.reshape(token_type_ids, [-1]), depth=token_type_vocab_size
        )
        token_type_embeddings = tf.reshape(
            tf.matmul(one_hot_ids, token_type_table),
            [batch_size, seq_length, width],
        )
        output += token_type_embeddings

    if use_position_embeddings:
        # The sequence must fit into the position table.
        length_check = tf.assert_less_equal(
            seq_length, max_position_embeddings
        )
        with tf.control_dependencies([length_check]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range),
            )
            # Keep only the rows for positions 0 .. seq_length - 1.
            position_embeddings = tf.slice(
                full_position_embeddings, [0, 0], [seq_length, -1]
            )
            # Every dimension before the last two becomes 1 so that the
            # position embeddings broadcast over them.
            num_leading_dims = len(output.shape.as_list()) - 2
            position_broadcast_shape = (
                [1] * num_leading_dims + [seq_length, width]
            )
            position_embeddings = tf.reshape(
                position_embeddings, position_broadcast_shape
            )
            output += position_embeddings

    return output
def _sample_n(self, n, seed=None):
    """Draws `n` samples around the (1, 0, ..., 0) mode and rotates them.

    The first coordinate is produced by `self._sample_3d` when the event size
    compares equal to 3, and otherwise by the Wood'94 rejection loop below.
    The remaining coordinates are an l2-normalized Gaussian draw scaled so the
    full vector is unit length. The result is passed through `self._rotate`.

    When `self._allow_nan_stats` is False, runtime range / unit-norm /
    rotation assertions and `tf.check_numerics` are attached to the samples.

    Args:
      n: Number of samples; becomes the leading entry of the sample shape.
      seed: Seed handed to `seed_stream.SeedStream`; every random op below
        takes its seed from a fresh `seed()` call on that stream.

    Returns:
      The value of `self._rotate(samples)`.
    """
    # NOTE(review): the salt string is spelled 'vom_mises_fisher'. It is a
    # runtime value that feeds the seed stream, so it is left untouched.
    seed = seed_stream.SeedStream(seed, salt='vom_mises_fisher')
    # The sampling strategy relies on the fact that vMF variates are symmetric
    # about the mean direction. Accordingly, if we have a sampling strategy for
    # the away-from-mean angle, then we can uniformly sample the remaining
    # dimensions on the S^{dim-2} sphere, and rotate these samples from a
    # (1, 0, 0, ..., 0)-mode distribution into the target orientation.
    #
    # This is easy to imagine on the 1-sphere (S^1; in 2-D space): sample a
    # von-Mises distributed `x` value in [-1, 1], then uniformly select what
    # amounts to a "up" or "down" additional degree of freedom after unit
    # normalizing, followed by a final rotation to the desired mean direction
    # from a basis of (1, 0).
    #
    # On S^2 (in 3-D), selecting a vMF `x` identifies a circle in `yz` on the
    # unit sphere over which the distribution is uniform, in particular the
    # circle where x = \hat{x} intersects the unit sphere. We pick a point on
    # that circle, then rotate to the desired mean direction from a basis of
    # (1, 0, 0).
    # Prefer the static event size; fall back to the dynamic shape tensor.
    event_dim = self.event_shape[0].value or self._event_shape_tensor()[0]

    sample_batch_shape = tf.concat([[n], self._batch_shape_tensor()], axis=0)
    dim = tf.cast(event_dim - 1, self.dtype)
    if event_dim == 3:
        samples_dim0 = self._sample_3d(n, seed=seed)
    else:
        # Wood'94 provides a rejection algorithm to sample the x coordinate.
        # Wood'94 definition of b:
        # b = (-2 * kappa + tf.sqrt(4 * kappa**2 + dim**2)) / dim
        # https://stats.stackexchange.com/questions/156729 suggests:
        b = dim / (2 * self.concentration +
                   tf.sqrt(4 * self.concentration**2 + dim**2))
        # TODO(bjp): Integrate any useful numerical tricks from hyperspherical VAE
        #     https://github.com/nicola-decao/s-vae-tf/
        x = (1 - b) / (1 + b)
        c = self.concentration * x + dim * tf.log1p(-x**2)
        beta = tf.distributions.Beta(dim / 2, dim / 2)

        def cond_fn(w, should_continue):
            # Keep looping while any element is still awaiting acceptance.
            del w
            return tf.reduce_any(should_continue)

        def body_fn(w, should_continue):
            z = beta.sample(sample_shape=sample_batch_shape, seed=seed())
            # Only overwrite proposals for elements that are not yet accepted.
            w = tf.where(should_continue, (1 - (1 + b) * z) / (1 - (1 - b) * z), w)
            w = tf.check_numerics(w, 'w')
            # An element keeps looping while its proposal fails the log-space
            # acceptance test against a fresh uniform draw.
            should_continue = tf.logical_and(
                should_continue,
                self.concentration * w + dim * tf.log1p(-x * w) - c < tf.log(
                    tf.random_uniform(sample_batch_shape, seed=seed(),
                                      dtype=self.dtype)))
            return w, should_continue

        w = tf.zeros(sample_batch_shape, dtype=self.dtype)
        should_continue = tf.ones(sample_batch_shape, dtype=tf.bool)
        samples_dim0 = tf.while_loop(cond_fn, body_fn, (w, should_continue))[0]
        # Add a trailing axis so it can be concatenated with the other dims.
        samples_dim0 = samples_dim0[..., tf.newaxis]
    if not self._allow_nan_stats:
        # Verify samples are w/in -1, 1, with useful error output tensors (top
        # value rather than all values).
        with tf.control_dependencies([
                tf.assert_less_equal(
                    samples_dim0, self.dtype.as_numpy_dtype(1.01),
                    data=[tf.nn.top_k(tf.reshape(samples_dim0, [-1]))[0]]),
                tf.assert_greater_equal(
                    samples_dim0, self.dtype.as_numpy_dtype(-1.01),
                    data=[
                        -tf.nn.top_k(tf.reshape(-samples_dim0, [-1]))[0]
                    ])
        ]):
            # The identity op is what actually carries the control dependency.
            samples_dim0 = tf.identity(samples_dim0)
    samples_otherdims_shape = tf.concat(
        [sample_batch_shape, [event_dim - 1]], axis=0)
    unit_otherdims = tf.nn.l2_normalize(tf.random_normal(
        samples_otherdims_shape, seed=seed(), dtype=self.dtype), axis=-1)
    samples = tf.concat(
        [
            samples_dim0,  # we must avoid sqrt(1 - (>1)**2)
            tf.sqrt(tf.maximum(1 - samples_dim0**2, 0.)) * unit_otherdims
        ],
        axis=-1)
    samples = tf.nn.l2_normalize(samples, axis=-1)
    if not self._allow_nan_stats:
        samples = tf.check_numerics(samples, 'samples')

    # Runtime assert that samples are unit length.
    if not self._allow_nan_stats:
        # `worst` is the largest deviation of any sample norm from 1; `idx`
        # locates it so the offending sample can be printed on failure.
        worst, idx = tf.nn.top_k(
            tf.reshape(tf.abs(1 - tf.linalg.norm(samples, axis=-1)), [-1]))
        with tf.control_dependencies([
                tf.assert_near(self.dtype.as_numpy_dtype(0), worst, data=[
                    worst, idx, tf.gather(
                        tf.reshape(samples, [-1, event_dim]), idx)
                ], atol=1e-4, summarize=100)
        ]):
            samples = tf.identity(samples)
    # The samples generated are symmetric around a mode at (1, 0, 0, ...., 0).
    # Now, we move the mode to `self.mean_direction` using a rotation matrix.
    if not self._allow_nan_stats:
        # Assert that the basis vector rotates to the mean direction, as expected.
        basis = tf.cast(
            tf.concat([[1.], tf.zeros([event_dim - 1])], axis=0), self.dtype)
        with tf.control_dependencies([
                tf.assert_less(
                    tf.linalg.norm(self._rotate(basis) - self.mean_direction,
                                   axis=-1),
                    self.dtype.as_numpy_dtype(1e-5))
        ]):
            return self._rotate(samples)
    return self._rotate(samples)
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing steps on a word embedding tensor.

    Optionally adds token-type embeddings and position embeddings, then
    applies `layer_norm_and_dropout`.

    Args:
      input_tensor: float Tensor; `get_shape_list` is called with
        `expected_rank=3` and the three entries are used as
        [batch_size, seq_length, width].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int32 Tensor. Must be given when
        `use_token_type` is True. It is flattened and later reshaped to
        [batch_size, seq_length, width] after the lookup.
      token_type_vocab_size: int. Vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. Name of the token-type embedding
        table variable.
      use_position_embeddings: bool. Whether to add position embeddings for
        the position of each token in the sequence.
      position_embedding_name: string. Name of the position embedding table
        variable.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length; may be longer than
        the sequence length of `input_tensor` but not shorter.
      dropout_prob: float. Dropout probability passed to
        `layer_norm_and_dropout` for the final output.

    Returns:
      Tensor with the same shape as `input_tensor`.

    Raises:
      ValueError: If `use_token_type` is True but `token_type_ids` is None.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("如果`use_token_type`是True, `token_type_ids`必须被赋值.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocabulary is small, so a one-hot matmul is used here; one-hot
        # is faster for small vocabularies.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # The position embedding table is a learned variable, created with
            # the (long) length `max_position_embeddings`. The actual sequence
            # may be shorter, which trains faster on tasks without long
            # sequences.
            #
            # So `full_position_embeddings` is effectively a table for
            # positions [0, 1, 2, ..., max_position_embeddings-1], the current
            # sequence has positions [0, 1, 2, ..., seq_length-1], and a slice
            # is all that is needed.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions (`seq_length` and `width`) are
            # relevant, so broadcast over the leading dimensions, which is
            # typically just the batch size.
            position_broadcast_shape = [1] * (num_dims - 2) + [seq_length, width]
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def percentile(x,
               q,
               axis=None,
               interpolation=None,
               keep_dims=False,
               validate_args=False,
               name=None):
    """Compute the `q`-th percentile(s) of `x`.

    For a vector `x`, the `q`-th percentile is the value `q / 100` of the way
    from the minimum to the maximum of a sorted copy of `x`. When the
    normalized rank does not land exactly on an element, `interpolation`
    selects which neighbor is returned.

    `q = 50` gives the median, `q = 0` the minimum and `q = 100` the maximum.

    Several percentiles can be requested at once with a `1-D` `q`; dimension
    zero of the result then indexes the percentiles.

    ```python
    # Get 30th percentile with default ('nearest') interpolation.
    x = [1., 2., 3., 4.]
    percentile(x, q=30.)
    ==> 2.0

    # Get 30th and 70th percentiles with 'lower' interpolation
    x = [1., 2., 3., 4.]
    percentile(x, q=[30., 70.], interpolation='lower')
    ==> [1., 3.]

    # Get 100th percentile (maximum). By default, this is computed over every dim
    x = [[1., 2.]
         [3., 4.]]
    percentile(x, q=100.)
    ==> 4.

    # Treat the leading dim as indexing samples, and find the 100th quantile (max)
    # over all such samples.
    x = [[1., 2.]
         [3., 4.]]
    percentile(x, q=100., axis=[0])
    ==> [3., 4.]
    ```

    Compare to `numpy.percentile`.

    Args:
      x: Floating point `N-D` `Tensor` with `N > 0`. If `axis` is not `None`,
        `x` must have statically known number of dimensions.
      q: Scalar or vector `Tensor` with values in `[0, 100]`. The percentile(s).
      axis: Optional `0-D` or `1-D` integer `Tensor` with constant values. The
        axes holding the independent samples over which the percentile is
        taken. If `None` (the default), every dimension is a sample dimension.
      interpolation: One of 'lower', 'higher', 'nearest'; `None` means
        'nearest'. Chooses between the two data points `i < j` surrounding the
        desired quantile: 'lower' picks `i`, 'higher' picks `j`, 'nearest'
        picks whichever is closer.
      keep_dims: Python `bool`. If `True`, the reduced dimensions are kept in
        the result as size-1 dimensions instead of being dropped.
      validate_args: Whether to add runtime checks of argument validity. If
        False, and arguments are incorrect, correct behavior is not guaranteed.
      name: A Python string name to give this `Op`. Default is 'percentile'.

    Returns:
      A `(rank(q) + N - len(axis))` dimensional `Tensor` of same dtype as `x`,
      or, if `axis` is `None`, a `rank(q)` `Tensor`. The first `rank(q)`
      dimensions index quantiles for different values of `q`.

    Raises:
      ValueError: If argument 'interpolation' is not an allowed type, or if
        `axis` is not statically available.
    """
    name = name or 'percentile'
    allowed_interpolations = {'lower', 'higher', 'nearest'}

    if interpolation is None:
        interpolation = 'nearest'
    elif interpolation not in allowed_interpolations:
        raise ValueError(
            'Argument `interpolation` must be in %s. Found %s' %
            (allowed_interpolations, interpolation))

    with tf.name_scope(name, values=[x, q]):
        x = tf.convert_to_tensor(x, name='x')
        # Work in double from here on: with a huge sample axis a lower
        # precision would yield the wrong index.
        q = tf.to_double(q, name='q')
        _get_static_ndims(q, expect_ndims_no_more_than=1)

        if validate_args:
            q_checks = [
                tf.assert_rank_in(q, [0, 1]),
                tf.assert_greater_equal(q, tf.to_double(0.)),
                tf.assert_less_equal(q, tf.to_double(100.)),
            ]
            q = control_flow_ops.with_dependencies(q_checks, q)

        if axis is None:
            y = tf.reshape(x, [-1])
        else:
            axis = tf.convert_to_tensor(axis, name='axis')
            tf.assert_integer(axis)
            axis_ndims = _get_static_ndims(
                axis, expect_static=True, expect_ndims_no_more_than=1)
            static_axis = tensor_util.constant_value(axis)
            if static_axis is None:
                raise ValueError(
                    'Expected argument `axis` to be statically available. '
                    'Found: %s' % axis)
            axis = static_axis
            if axis_ndims == 0:
                axis = [axis]
            axis = [int(a) for a in axis]
            x_ndims = _get_static_ndims(
                x, expect_static=True, expect_ndims_at_least=1)
            axis = _make_static_axis_non_negative(axis, x_ndims)
            # _sort_tensor (which calls top_k) only sorts the last dim, so the
            # sample dims are moved there and flattened.
            y = _move_dims_to_flat_end(x, axis, x_ndims)

        frac_at_q_or_above = 1. - q / 100.
        num_samples = tf.to_double(tf.shape(y)[-1])

        # Each interpolation mode maps to one rounding op applied to the
        # fractional index.
        rounding_op = {
            'lower': tf.ceil,
            'higher': tf.floor,
            'nearest': tf.round,
        }[interpolation]
        indices = rounding_op((num_samples - 1) * frac_at_q_or_above)

        # For a gigantic sample count, d == d - 1 even in double, so the index
        # is clipped in int32 where d - 1 is distinct from d.
        last_dim = tf.shape(y)[-1]
        indices = tf.clip_by_value(tf.to_int32(indices), 0, last_dim - 1)

        # Sort everything, not just the top 'k' entries, which allows multiple
        # calls to sort only once (under the hood) and use CSE.
        sorted_y = _sort_tensor(y)

        # Gather along the sorted (last) dimension. If q is a vector, the last
        # dim of the gathered tensor indexes the different q[i].
        gathered_y = tf.gather(sorted_y, indices, axis=-1)

        if keep_dims:
            if axis is not None:
                gathered_y = _insert_back_keep_dims(gathered_y, axis)
            else:
                ones_vec = tf.ones(
                    shape=[_get_best_effort_ndims(x) + _get_best_effort_ndims(q)],
                    dtype=tf.int32)
                gathered_y *= tf.ones(ones_vec, dtype=x.dtype)

        # A scalar q already yields the right shape. A vector q leaves a
        # trailing dim of shape q.shape that must be rotated to dim 0.
        return util.rotate_transpose(gathered_y, tf.rank(q))
def check_range(tensor, low, high, message_prefix=''):
    """Returns `tensor` gated on runtime checks that `low <= tensor <= high`.

    Args:
      tensor: Value to check and pass through.
      low: Lower bound, compared with `tf.assert_greater_equal`.
      high: Upper bound, compared with `tf.assert_less_equal`.
      message_prefix: String prepended to the '>=' / '<=' assertion messages.

    Returns:
      `tf.identity(tensor)` created under a control dependency on both
      assertions.
    """
    bound_checks = [
        tf.assert_greater_equal(tensor, low, message=message_prefix + '>='),
        tf.assert_less_equal(tensor, high, message=message_prefix + '<='),
    ]
    with tf.control_dependencies(bound_checks):
        checked = tf.identity(tensor)
    return checked
def tower(inputs,
          is_training,
          dropout_probability,
          input_noise,
          normalize_input,
          flip_horizontally,
          translate,
          num_logits,
          is_initialization=False,
          name=None):
    """Builds the convolutional tower and returns two 10-way logit heads.

    Args:
      inputs: Image batch; asserted to have shape [None, 32, 32, 3].
      is_training: Forwarded as `is_training` to the augmentation, dropout and
        weight-normalized layers through `slim.arg_scope`.
      dropout_probability: Drop probability; `1 - dropout_probability` is the
        value passed to `slim.dropout`.
      input_noise: Scale passed to `nn.gaussian_noise`.
      normalize_input: Predicate for `tf.cond`; when true the input goes
        through `slim.layer_norm` without scale or center.
      flip_horizontally: Passed as `horizontally` to `nn.flip_randomly`.
      translate: Predicate for `tf.cond`; when true `nn.random_translate` is
        applied with scale 2.
      num_logits: Asserted at runtime to lie in [1, 2]. With 1 the secondary
        output is the primary head; with 2 it is the separate second head.
      is_initialization: Passed as `init` to the `wn` layers.
      name: Optional name scope (default "tower").

    Returns:
      Tuple `(primary_logits, secondary_logits)`, each asserted to have shape
      [None, 10].
    """
    with tf.name_scope(name, "tower"):
        conv_defaults = {
            'padding': 'SAME',
            'kernel_size': [3, 3],
            'activation_fn': nn.lrelu,
            'init': is_initialization,
        }
        mode_dependent_fns = [
            nn.random_translate, nn.flip_randomly, nn.gaussian_noise,
            slim.dropout, wn.fully_connected, wn.conv2d
        ]

        with slim.arg_scope([wn.conv2d], **conv_defaults), \
                slim.arg_scope(mode_dependent_fns, is_training=is_training):
            #pylint: disable=no-value-for-parameter
            net = inputs
            assert_shape(net, [None, 32, 32, 3])

            net = tf.cond(
                normalize_input,
                lambda: slim.layer_norm(
                    net, scale=False, center=False, scope='normalize_inputs'),
                lambda: net)
            assert_shape(net, [None, 32, 32, 3])

            # Input augmentation.
            net = nn.flip_randomly(net,
                                   horizontally=flip_horizontally,
                                   vertically=False,
                                   name='random_flip')
            net = tf.cond(
                translate,
                lambda: nn.random_translate(
                    net, scale=2, name='random_translate'),
                lambda: net)
            net = nn.gaussian_noise(net, scale=input_noise, name='gaussian_noise')

            # Two stages of three 3x3 convs, each followed by 2x2 max-pool and
            # dropout: (stage index, channels, spatial size after pooling).
            for stage, channels, side in ((1, 128, 16), (2, 256, 8)):
                for layer in (1, 2, 3):
                    net = wn.conv2d(net, channels,
                                    scope="conv_%d_%d" % (stage, layer))
                net = slim.max_pool2d(net, [2, 2], scope='max_pool_%d' % stage)
                net = slim.dropout(net, 1 - dropout_probability,
                                   scope='dropout_probability_%d' % stage)
                assert_shape(net, [None, side, side, channels])

            net = wn.conv2d(net, 512, padding='VALID', scope="conv_3_1")
            assert_shape(net, [None, 6, 6, 512])
            net = wn.conv2d(net, 256, kernel_size=[1, 1], scope="conv_3_2")
            net = wn.conv2d(net, 128, kernel_size=[1, 1], scope="conv_3_3")
            net = slim.avg_pool2d(net, [6, 6], scope='avg_pool')
            assert_shape(net, [None, 1, 1, 128])

            net = slim.flatten(net)
            assert_shape(net, [None, 128])

            primary_logits = wn.fully_connected(net, 10, init=is_initialization)
            second_head = wn.fully_connected(net, 10, init=is_initialization)

            num_logits_checks = [
                tf.assert_greater_equal(num_logits, 1),
                tf.assert_less_equal(num_logits, 2),
            ]
            with tf.control_dependencies(num_logits_checks):
                secondary_logits = tf.case([
                    (tf.equal(num_logits, 1), lambda: primary_logits),
                    (tf.equal(num_logits, 2), lambda: second_head),
                ], exclusive=True, default=lambda: primary_logits)

            assert_shape(primary_logits, [None, 10])
            assert_shape(secondary_logits, [None, 10])
            return primary_logits, secondary_logits
def embedding_postprocessor(input_tensor,
                            use_position_embeddings=True,
                            position1_ids=None,
                            position2_ids=None,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    When `use_position_embeddings` is True, the embeddings looked up for
    `position1_ids` and `position2_ids` are summed and added to the input.
    `layer_norm_and_dropout` is then applied to the result.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_position_embeddings: bool. Whether to add position embeddings.
      position1_ids: int Tensor holding batch_size * seq_length position ids
        (it is flattened before the lookup). Required when
        `use_position_embeddings` is True. Ids index a table with
        `max_position_embeddings * 2` rows.
      position2_ids: Second set of position ids, same requirements as
        `position1_ids`.
      position_embedding_name: string. The name of the embedding table variable
        for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.
      dropout_prob: float. Dropout probability applied to the final output
        tensor.

    Returns:
      float tensor with same shape as `input_tensor`.

    Raises:
      ValueError: If `use_position_embeddings` is True and either
        `position1_ids` or `position2_ids` is None.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    embedding_size = input_shape[2]

    output = input_tensor

    if use_position_embeddings:
        # Identity comparison: `== None` on a tensor may be overloaded, and the
        # exception has to be raised (not merely constructed) to take effect.
        if position1_ids is None or position2_ids is None:
            raise ValueError('You need input the position information.')

        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            # The position table is a learned variable. It is created with
            # `max_position_embeddings * 2` rows and indexed by explicit
            # position ids rather than sliced by sequence length.
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings * 2, embedding_size],
                initializer=create_initializer(initializer_range))

            flat_position1_ids = tf.reshape(position1_ids, shape=[-1])
            flat_position2_ids = tf.reshape(position2_ids, shape=[-1])
            position1_embeddings = tf.nn.embedding_lookup(
                full_position_embeddings, flat_position1_ids)
            position2_embeddings = tf.nn.embedding_lookup(
                full_position_embeddings, flat_position2_ids)

            # Both lookups share the same table; their sum is the position
            # signal added to each token.
            position_embeddings = position1_embeddings + position2_embeddings
            position_embeddings = tf.reshape(
                position_embeddings,
                shape=[batch_size, seq_length, embedding_size])
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def preprocess_data(sequence_id, sequence, audio, velocity_range, hparams,
                    is_training):
    """Compute spectral representation, labels, and length from sequence/audio.

    Args:
      sequence_id: id of the sequence.
      sequence: String tensor containing serialized NoteSequence proto.
      audio: String tensor containing WAV data.
      velocity_range: String tensor containing max and min velocities of file
        as a serialized VelocityRange.
      hparams: HParams object specifying hyperparameters.
      is_training: Whether or not this is a training run.

    Returns:
      An InputTensors tuple. When `is_training` is set and
      `hparams.max_expected_train_example_len` is truthy, the `length` field
      carries a control dependency on an assertion that it does not exceed
      that maximum.

    Raises:
      ValueError: If hparams contains an invalid spec_type (raised by the
        helper ops; not visible in this function).
    """
    wav_jitter_amount_ms = label_jitter_amount_ms = 0
    # If there is combined jitter, it must be generated once here so the audio
    # and the labels are shifted by the same amount.
    if is_training and hparams.jitter_amount_ms > 0:
        wav_jitter_amount_ms = np.random.choice(hparams.jitter_amount_ms, size=1)
        label_jitter_amount_ms = wav_jitter_amount_ms

    if label_jitter_amount_ms > 0:
        sequence = jitter_label_op(sequence, label_jitter_amount_ms / 1000.)

    # Possibly shift the entire sequence backward for better forward-only
    # training.
    if hparams.backward_shift_amount_ms > 0:
        sequence = jitter_label_op(sequence,
                                   hparams.backward_shift_amount_ms / 1000.)

    if is_training:
        audio = transform_wav_data_op(
            audio,
            hparams=hparams,
            jitter_amount_sec=wav_jitter_amount_ms / 1000.)

    spec = wav_to_spec_op(audio, hparams=hparams)

    labels, label_weights, onsets, offsets, velocities = sequence_to_pianoroll_op(
        sequence, velocity_range, hparams=hparams)

    length = wav_to_num_frames_op(audio, hparams_frames_per_second(hparams))

    if hparams.max_expected_train_example_len and is_training:
        length_check = tf.assert_less_equal(
            length, hparams.max_expected_train_example_len)
        # Control dependencies only attach to ops created inside the context.
        # Building the result tuple creates no ops, so the assertion is bound
        # to `length` through an identity op; otherwise it would never run.
        with tf.control_dependencies([length_check]):
            length = tf.identity(length)

    return InputTensors(
        spec=spec,
        labels=labels,
        label_weights=label_weights,
        length=length,
        onsets=onsets,
        offsets=offsets,
        velocities=velocities,
        sequence_id=sequence_id,
        note_sequence=sequence)
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=2,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Adds token-type / position embeddings, then layer norm and dropout.

    Args:
      input_tensor: float Tensor of rank 3; its dimensions are used as
        [batch_size, seq_length, width].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int Tensor with batch_size * seq_length ids.
        Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. Name of the token-type embedding table
        variable.
      use_position_embeddings: bool. Whether to add position embeddings for the
        position of each token in the sequence.
      position_embedding_name: string. Name of the position embedding table
        variable.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Number of rows in the position table; the
        sequence length is asserted to be at most this value.
      dropout_prob: float. Dropout probability passed to
        `layer_norm_and_dropout` for the final output.

    Returns:
      float tensor with same shape as `input_tensor`.

    Raises:
      ValueError: If `use_token_type` is True but `token_type_ids` is None.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is
        # always faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(
            flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we
            # create it using a (long) sequence length
            # `max_position_embeddings`. The actual sequence length might be
            # shorter than this, for faster training of tasks that do not have
            # long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the
            # current sequence has positions [0, 1, 2, ... seq_length-1], so we
            # can just perform a slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and
            # `width`), so we broadcast among the first dimensions, which is
            # typically just the batch size.
            position_broadcast_shape = [1] * (num_dims - 2) + [seq_length, width]
            # position_embeddings: [1, seq_length, width]
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            # output: [batch_size, seq_length, width]; the add broadcasts
            # position_embeddings over the batch dimension.
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def fgsm_generate(x,
                  model,
                  y=None,
                  eps=0.3,
                  ord=np.inf,
                  clip_min=None,
                  clip_max=None,
                  clip_grad=False,
                  targeted=False,
                  sanity_checks=True):
    """Builds a single-step gradient adversarial example for `x`.

    Args:
      x: Input tensor to perturb.
      model: Callable applied to `x`; the first input of the op producing its
        output is used as the logits (presumably the output is a softmax over
        those logits -- TODO confirm against the model definition).
      y: Optional label tensor. When None, the model's own arg-max predictions
        are used (with gradients stopped) to avoid label leaking. Labels are
        normalized to sum to 1 along axis 1.
      eps: Perturbation budget passed to `optimize_linear`.
      ord: Norm order passed to `optimize_linear`.
      clip_min: Optional lower bound of the data range.
      clip_max: Optional upper bound of the data range.
      clip_grad: If True, the gradient is passed through
        `zero_out_clipped_grads` before computing the perturbation.
      targeted: If True, the loss sign is flipped so the step moves toward `y`.
      sanity_checks: If True, the result depends on the input-range assertions.

    Returns:
      The adversarial example tensor.

    Raises:
      AssertionError: If exactly one of `clip_min` / `clip_max` is given
        (one-sided clipping is not supported).
    """
    range_checks = []

    # When a data range is given, verify the input actually lies inside it.
    if clip_min is not None:
        range_checks.append(
            tf.assert_greater_equal(x, tf.cast(clip_min, x.dtype)))
    if clip_max is not None:
        range_checks.append(
            tf.assert_less_equal(x, tf.cast(clip_max, x.dtype)))

    logits = model(x)._op.inputs[0]

    if y is None:
        # Use the model's predictions as ground truth to avoid label leaking.
        top_logit = reduce_max(logits, 1, keepdims=True)
        y = tf.stop_gradient(tf.to_float(tf.equal(logits, top_logit)))
    y = y / reduce_sum(y, 1, keepdims=True)

    # Cross-entropy loss; no other loss variant is implemented here.
    loss = softmax_cross_entropy_with_logits(labels=y, logits=logits)
    if targeted:
        loss = -loss

    # Gradient of the loss with respect to the input.
    (gradient,) = tf.gradients(loss, x)

    if clip_grad:
        gradient = zero_out_clipped_grads(gradient, x, clip_min, clip_max)

    perturbation = optimize_linear(gradient, eps, ord)

    # Perturb the original example to obtain the adversarial example.
    adv_x = x + perturbation

    # When clipping is requested, pull values back into [clip_min, clip_max].
    if not (clip_min is None and clip_max is None):
        # We don't currently support one-sided clipping
        assert clip_min is not None and clip_max is not None
        adv_x = clip_by_value(adv_x, clip_min, clip_max)

    if sanity_checks:
        with tf.control_dependencies(range_checks):
            adv_x = tf.identity(adv_x)

    return adv_x
def __init__(self,
             mean_direction,
             concentration,
             validate_args=False,
             allow_nan_stats=True,
             name='VonMisesFisher'):
    """Creates a new `VonMisesFisher` instance.

    Args:
      mean_direction: Floating-point `Tensor` with shape [B1, ... Bn, D].
        A unit vector indicating the mode of the distribution, or the
        unit-normalized direction of the mean. (This is *not* in general the
        mean of the distribution; the mean is not generally in the support of
        the distribution.) NOTE: `D` is currently restricted to <= 5.
      concentration: Floating-point `Tensor` having batch shape [B1, ... Bn]
        broadcastable with `mean_direction`. The level of concentration of
        samples around the `mean_direction`. `concentration=0` indicates a
        uniform distribution over the unit hypersphere, and
        `concentration=+inf` indicates a `Deterministic` distribution (delta
        function) at `mean_direction`.
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
        result is undefined. When `False`, an exception is raised if one or
        more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      ValueError: For known-bad arguments, i.e. unsupported event dimension.
    """
    # Must stay the first statement so only the constructor arguments are
    # captured.
    parameters = dict(locals())
    with tf.name_scope(name, values=[mean_direction, concentration]) as name:
        dtype = dtype_util.common_dtype([mean_direction, concentration],
                                        tf.float32)
        mean_direction = tf.convert_to_tensor(
            mean_direction, name='mean_direction', dtype=dtype)
        concentration = tf.convert_to_tensor(
            concentration, name='concentration', dtype=dtype)

        assertions = []
        if validate_args:
            assertions = [
                tf.assert_non_negative(
                    concentration,
                    message='`concentration` must be non-negative'),
                tf.assert_greater(
                    tf.shape(mean_direction)[-1], 1,
                    message='`mean_direction` may not have scalar event shape'),
                tf.assert_near(
                    1., tf.linalg.norm(mean_direction, axis=-1),
                    message='`mean_direction` must be unit-length'),
            ]

        # Static size of the event dimension, or None when unknown at graph
        # construction time.
        static_event_size = mean_direction.shape.with_rank_at_least(1)[-1].value
        if static_event_size is not None:
            if static_event_size > 5:
                raise ValueError('vMF ndims > 5 is not currently supported')
        elif validate_args:
            # Size unknown statically: defer the check to run time.
            assertions.append(
                tf.assert_less_equal(
                    tf.shape(mean_direction)[-1], 5,
                    message='vMF ndims > 5 is not currently supported'))

        with tf.control_dependencies(assertions):
            self._mean_direction = tf.identity(mean_direction)
            self._concentration = tf.identity(concentration)
        tf.assert_same_float_dtype([self._mean_direction, self._concentration])

        # mean_direction is always reparameterized.
        # concentration is only for event_dim==3, via an inversion sampler.
        if static_event_size == 3:
            reparameterization_type = reparameterization.FULLY_REPARAMETERIZED
        else:
            reparameterization_type = reparameterization.NOT_REPARAMETERIZED
    super(VonMisesFisher, self).__init__(
        dtype=self._concentration.dtype,
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats,
        reparameterization_type=reparameterization_type,
        parameters=parameters,
        graph_parents=[self._mean_direction, self._concentration],
        name=name)
def _sample_n(self, n, seed=None):
    """Draws `n` samples around the (1, 0, ..., 0) mode and rotates them.

    The first coordinate is produced by `self._sample_3d` when the event size
    compares equal to 3, and otherwise by the Wood'94 rejection loop below.
    The remaining coordinates are an l2-normalized Gaussian draw scaled so the
    full vector is unit length. The result is passed through `self._rotate`.

    When `self._allow_nan_stats` is False, runtime range / unit-norm /
    rotation assertions and `tf.check_numerics` are attached to the samples.

    Args:
      n: Number of samples; becomes the leading entry of the sample shape.
      seed: Seed handed to `seed_stream.SeedStream`; every random op below
        takes its seed from a fresh `seed()` call on that stream.

    Returns:
      The value of `self._rotate(samples)`.
    """
    # NOTE(review): the salt string is spelled 'vom_mises_fisher'. It is a
    # runtime value that feeds the seed stream, so it is left untouched.
    seed = seed_stream.SeedStream(seed, salt='vom_mises_fisher')
    # The sampling strategy relies on the fact that vMF variates are symmetric
    # about the mean direction. Accordingly, if we have a sampling strategy for
    # the away-from-mean angle, then we can uniformly sample the remaining
    # dimensions on the S^{dim-2} sphere, and rotate these samples from a
    # (1, 0, 0, ..., 0)-mode distribution into the target orientation.
    #
    # This is easy to imagine on the 1-sphere (S^1; in 2-D space): sample a
    # von-Mises distributed `x` value in [-1, 1], then uniformly select what
    # amounts to a "up" or "down" additional degree of freedom after unit
    # normalizing, followed by a final rotation to the desired mean direction
    # from a basis of (1, 0).
    #
    # On S^2 (in 3-D), selecting a vMF `x` identifies a circle in `yz` on the
    # unit sphere over which the distribution is uniform, in particular the
    # circle where x = \hat{x} intersects the unit sphere. We pick a point on
    # that circle, then rotate to the desired mean direction from a basis of
    # (1, 0, 0).
    # Prefer the static event size; fall back to the dynamic shape tensor.
    event_dim = self.event_shape[0].value or self._event_shape_tensor()[0]

    sample_batch_shape = tf.concat([[n], self._batch_shape_tensor()], axis=0)
    dim = tf.cast(event_dim - 1, self.dtype)
    if event_dim == 3:
        samples_dim0 = self._sample_3d(n, seed=seed)
    else:
        # Wood'94 provides a rejection algorithm to sample the x coordinate.
        # Wood'94 definition of b:
        # b = (-2 * kappa + tf.sqrt(4 * kappa**2 + dim**2)) / dim
        # https://stats.stackexchange.com/questions/156729 suggests:
        b = dim / (2 * self.concentration +
                   tf.sqrt(4 * self.concentration**2 + dim**2))
        # TODO(bjp): Integrate any useful numerical tricks from hyperspherical VAE
        #     https://github.com/nicola-decao/s-vae-tf/
        x = (1 - b) / (1 + b)
        c = self.concentration * x + dim * tf.log1p(-x**2)
        beta = beta_lib.Beta(dim / 2, dim / 2)

        def cond_fn(w, should_continue):
            # Keep looping while any element is still awaiting acceptance.
            del w
            return tf.reduce_any(should_continue)

        def body_fn(w, should_continue):
            z = beta.sample(sample_shape=sample_batch_shape, seed=seed())
            # Only overwrite proposals for elements that are not yet accepted.
            w = tf.where(should_continue, (1 - (1 + b) * z) / (1 - (1 - b) * z), w)
            w = tf.check_numerics(w, 'w')
            # An element keeps looping while its proposal fails the log-space
            # acceptance test against a fresh uniform draw.
            should_continue = tf.logical_and(
                should_continue,
                self.concentration * w + dim * tf.log1p(-x * w) - c <
                tf.log(tf.random_uniform(sample_batch_shape, seed=seed(),
                                         dtype=self.dtype)))
            return w, should_continue

        w = tf.zeros(sample_batch_shape, dtype=self.dtype)
        should_continue = tf.ones(sample_batch_shape, dtype=tf.bool)
        samples_dim0 = tf.while_loop(cond_fn, body_fn, (w, should_continue))[0]
        # Add a trailing axis so it can be concatenated with the other dims.
        samples_dim0 = samples_dim0[..., tf.newaxis]
    if not self._allow_nan_stats:
        # Verify samples are w/in -1, 1, with useful error output tensors (top
        # value rather than all values).
        with tf.control_dependencies([
                tf.assert_less_equal(
                    samples_dim0, self.dtype.as_numpy_dtype(1.01),
                    data=[tf.nn.top_k(tf.reshape(samples_dim0, [-1]))[0]]),
                tf.assert_greater_equal(
                    samples_dim0, self.dtype.as_numpy_dtype(-1.01),
                    data=[-tf.nn.top_k(tf.reshape(-samples_dim0, [-1]))[0]])]):
            # The identity op is what actually carries the control dependency.
            samples_dim0 = tf.identity(samples_dim0)
    samples_otherdims_shape = tf.concat([sample_batch_shape, [event_dim - 1]],
                                        axis=0)
    unit_otherdims = tf.nn.l2_normalize(
        tf.random_normal(samples_otherdims_shape, seed=seed(),
                         dtype=self.dtype),
        axis=-1)
    samples = tf.concat([
        samples_dim0,  # we must avoid sqrt(1 - (>1)**2)
        tf.sqrt(tf.maximum(1 - samples_dim0**2, 0.)) * unit_otherdims
    ], axis=-1)
    samples = tf.nn.l2_normalize(samples, axis=-1)
    if not self._allow_nan_stats:
        samples = tf.check_numerics(samples, 'samples')

    # Runtime assert that samples are unit length.
    if not self._allow_nan_stats:
        # `worst` is the largest deviation of any sample norm from 1; `idx`
        # locates it so the offending sample can be printed on failure.
        worst, idx = tf.nn.top_k(
            tf.reshape(tf.abs(1 - tf.linalg.norm(samples, axis=-1)), [-1]))
        with tf.control_dependencies([
                tf.assert_near(
                    self.dtype.as_numpy_dtype(0), worst,
                    data=[worst, idx,
                          tf.gather(tf.reshape(samples, [-1, event_dim]), idx)],
                    atol=1e-4, summarize=100)]):
            samples = tf.identity(samples)
    # The samples generated are symmetric around a mode at (1, 0, 0, ...., 0).
    # Now, we move the mode to `self.mean_direction` using a rotation matrix.
    if not self._allow_nan_stats:
        # Assert that the basis vector rotates to the mean direction, as expected.
        basis = tf.cast(tf.concat([[1.], tf.zeros([event_dim - 1])], axis=0),
                        self.dtype)
        with tf.control_dependencies([
                tf.assert_less(
                    tf.linalg.norm(self._rotate(basis) - self.mean_direction,
                                   axis=-1),
                    self.dtype.as_numpy_dtype(1e-5))
        ]):
            return self._rotate(samples)
    return self._rotate(samples)
def __init__(self,
             batch_size,
             total_num_examples,
             max_learning_rate=1.,
             preconditioner_decay_rate=0.95,
             burnin=25,
             burnin_max_learning_rate=1e-6,
             use_single_learning_rate=False,
             name=None,
             variable_scope=None):
    """Create the hyperparameter tensors and the step counter.

    Each numeric hyperparameter is converted to a tensor and then re-bound to
    a copy that depends on its validity assertions, so the checks run whenever
    the stored tensor is evaluated.

    Args:
      batch_size: Converted to a tensor; asserted to be greater than zero.
      total_num_examples: Converted to a tensor; asserted to be greater than
        zero.
      max_learning_rate: Converted to a tensor; asserted to be non-negative.
      preconditioner_decay_rate: Converted to a tensor; asserted to lie in
        `[0, 1]`.
      burnin: Converted to a tensor; asserted to be a non-negative integer.
      burnin_max_learning_rate: Converted to a tensor; asserted to be
        non-negative.
      use_single_learning_rate: Stored on the instance unchanged.
      name: Optional name for the op scope and the optimizer; falls back to
        'VariationalSGD'.
      variable_scope: Optional variable scope that holds the `counter`
        variable. When `None`, a new uniquely named scope is opened.
    """
    default_name = 'VariationalSGD'
    scope_inputs = [
        max_learning_rate, preconditioner_decay_rate, batch_size, burnin,
        burnin_max_learning_rate
    ]
    with tf.name_scope(name, default_name, scope_inputs):
        if variable_scope is not None:
            self._variable_scope = variable_scope
        else:
            unique_scope_name = tf.get_default_graph().unique_name(
                name or default_name)
            with tf.variable_scope(unique_scope_name) as fresh_scope:
                self._variable_scope = fresh_scope

        # Tensor conversion; the creation order of these ops is significant
        # for auto-generated op names and is kept as-is.
        self._preconditioner_decay_rate = tf.convert_to_tensor(
            preconditioner_decay_rate, name='preconditioner_decay_rate')
        self._batch_size = tf.convert_to_tensor(batch_size, name='batch_size')
        self._total_num_examples = tf.convert_to_tensor(
            total_num_examples, name='total_num_examples')
        self._burnin = tf.convert_to_tensor(burnin, name='burnin')
        self._burnin_max_learning_rate = tf.convert_to_tensor(
            burnin_max_learning_rate, name='burnin_max_learning_rate')
        self._max_learning_rate = tf.convert_to_tensor(
            max_learning_rate, name='max_learning_rate')
        self._use_single_learning_rate = use_single_learning_rate

        with tf.variable_scope(self._variable_scope):
            self._counter = tf.get_variable(
                'counter', initializer=0, trainable=False)

        # Attach the validity checks to each hyperparameter tensor.
        decay_checks = [
            tf.assert_non_negative(
                self._preconditioner_decay_rate,
                message='`preconditioner_decay_rate` must be non-negative'),
            tf.assert_less_equal(
                self._preconditioner_decay_rate,
                1.,
                message='`preconditioner_decay_rate` must be at most 1.'),
        ]
        self._preconditioner_decay_rate = control_flow_ops.with_dependencies(
            decay_checks, self._preconditioner_decay_rate)

        batch_size_checks = [
            tf.assert_greater(
                self._batch_size,
                0,
                message='`batch_size` must be greater than zero'),
        ]
        self._batch_size = control_flow_ops.with_dependencies(
            batch_size_checks, self._batch_size)

        num_examples_checks = [
            tf.assert_greater(
                self._total_num_examples,
                0,
                message='`total_num_examples` must be greater than zero'),
        ]
        self._total_num_examples = control_flow_ops.with_dependencies(
            num_examples_checks, self._total_num_examples)

        burnin_checks = [
            tf.assert_non_negative(
                self._burnin, message='`burnin` must be non-negative'),
            tf.assert_integer(
                self._burnin, message='`burnin` must be an integer'),
        ]
        self._burnin = control_flow_ops.with_dependencies(
            burnin_checks, self._burnin)

        burnin_rate_checks = [
            tf.assert_non_negative(
                self._burnin_max_learning_rate,
                message='`burnin_max_learning_rate` must be non-negative'),
        ]
        self._burnin_max_learning_rate = control_flow_ops.with_dependencies(
            burnin_rate_checks, self._burnin_max_learning_rate)

        max_rate_checks = [
            tf.assert_non_negative(
                self._max_learning_rate,
                message='`max_learning_rate` must be non-negative'),
        ]
        self._max_learning_rate = control_flow_ops.with_dependencies(
            max_rate_checks, self._max_learning_rate)

        super(VariationalSGD, self).__init__(
            use_locking=False, name=name or default_name)
def __init__(self,
             df,
             scale_operator,
             input_output_cholesky=False,
             validate_args=False,
             allow_nan_stats=True,
             name=None):
    """Construct Wishart distributions.

    Args:
      df: `float` or `double` tensor, the degrees of freedom of the
        distribution(s). `df` must be greater than or equal to `k`.
      scale_operator: `float` or `double` instance of `LinearOperator`.
      input_output_cholesky: Python `bool`. If `True`, functions whose input
        or output have the semantics of samples assume inputs are in Cholesky
        form and return outputs in Cholesky form. In particular, if this flag
        is `True`, input to `log_prob` is presumed of Cholesky form and output
        from `sample`, `mean`, and `mode` are of Cholesky form. Setting this
        argument to `True` is purely a computational optimization and does not
        change the underlying distribution; for instance, `mean` returns the
        Cholesky of the mean, not the mean of Cholesky factors. The `variance`
        and `stddev` methods are unaffected by this flag.
        Default value: `False` (i.e., input/output does not have Cholesky
        semantics).
      validate_args: Python `bool`, default `False`. When `True` distribution
        parameters are checked for validity despite possibly degrading runtime
        performance. When `False` invalid inputs may silently render incorrect
        outputs.
      allow_nan_stats: Python `bool`, default `True`. When `True`, statistics
        (e.g., mean, mode, variance) use the value "`NaN`" to indicate the
        result is undefined. When `False`, an exception is raised if one or
        more of the statistic's batch members are undefined.
      name: Python `str` name prefixed to Ops created by this class.

    Raises:
      TypeError: if scale is not floating-type
      TypeError: if scale.dtype != df.dtype
      ValueError: if `scale_operator` is not square
      ValueError: if df < k, where scale operator event shape is `(k, k)`
    """
    # Must stay the first statement: it snapshots the constructor arguments
    # before any other local name is bound.
    parameters = dict(locals())
    self._input_output_cholesky = input_output_cholesky
    with tf.name_scope(name) as name:
        with tf.name_scope("init", values=[df, scale_operator]):
            if not scale_operator.dtype.is_floating:
                raise TypeError(
                    "scale_operator.dtype=%s is not a floating-point type" %
                    scale_operator.dtype)
            if not scale_operator.is_square:
                # No tensor is evaluated here: this runs at graph-construction
                # time, where a session may not exist.
                raise ValueError("scale_operator must be square.")

            self._scale_operator = scale_operator
            self._df = tf.convert_to_tensor(
                df, dtype=scale_operator.dtype, name="df")
            contrib_tensor_util.assert_same_float_dtype(
                (self._df, self._scale_operator))

            # Use the static dimension when it is known, otherwise fall back
            # to the dynamic one.
            if (self._scale_operator.shape.ndims is None or
                    self._scale_operator.shape[-1].value is None):
                self._dimension = tf.cast(
                    self._scale_operator.domain_dimension_tensor(),
                    dtype=self._scale_operator.dtype,
                    name="dimension")
            else:
                self._dimension = tf.convert_to_tensor(
                    self._scale_operator.shape[-1].value,
                    dtype=self._scale_operator.dtype,
                    name="dimension")

            df_val = tensor_util.constant_value(self._df)
            dim_val = tensor_util.constant_value(self._dimension)
            if df_val is not None and dim_val is not None:
                # Both values are known statically, so validate right away.
                # np.any reduces over every element, which also covers `df`
                # constants of rank 0 and of rank >= 2.
                df_val = np.asarray(df_val)
                if np.any(df_val < dim_val):
                    raise ValueError(
                        "Degrees of freedom (df = %s) cannot be less than "
                        "dimension of scale matrix (scale.dimension = %s)" %
                        (df_val, dim_val))
            elif validate_args:
                # Otherwise defer the check to run time.
                assertions = tf.assert_less_equal(
                    self._dimension,
                    self._df,
                    message=("Degrees of freedom (df = %s) cannot be "
                             "less than dimension of scale matrix "
                             "(scale.dimension = %s)" %
                             (self._df, self._dimension)))
                self._df = control_flow_ops.with_dependencies(
                    [assertions], self._df)
    super(_WishartLinearOperator, self).__init__(
        dtype=self._scale_operator.dtype,
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats,
        reparameterization_type=tf.distributions.FULLY_REPARAMETERIZED,
        parameters=parameters,
        graph_parents=([self._df, self._dimension] +
                       self._scale_operator.graph_parents),
        name=name)
def __init__(self,
             learning_rate,
             preconditioner_decay_rate=0.95,
             data_size=1,
             burnin=25,
             diagonal_bias=1e-8,
             name=None,
             parallel_iterations=10,
             variable_scope=None):
    """Create the hyperparameter tensors and the step counter.

    Args:
      learning_rate: Converted to a tensor; no range check is attached here.
      preconditioner_decay_rate: Converted to a tensor; asserted to lie in
        `[0, 1]`.
      data_size: Converted to a tensor; asserted to be greater than zero.
      burnin: Converted to a tensor; asserted to be a non-negative integer.
      diagonal_bias: Converted to a tensor; asserted to be non-negative.
      name: Optional name for the op scope and the optimizer; falls back to
        'StochasticGradientLangevinDynamics'.
      parallel_iterations: Stored on the instance unchanged.
      variable_scope: Optional variable scope that holds the `counter`
        variable. When `None`, a new uniquely named scope is opened.

    Raises:
      NotImplementedError: If eager execution is enabled.
    """
    default_name = 'StochasticGradientLangevinDynamics'
    scope_inputs = [
        learning_rate, preconditioner_decay_rate, data_size, burnin,
        diagonal_bias
    ]
    with tf.name_scope(name, default_name, scope_inputs):
        if tf.executing_eagerly():
            raise NotImplementedError(
                'Eager execution currently not supported for '
                ' SGLD optimizer.')

        if variable_scope is not None:
            self._variable_scope = variable_scope
        else:
            unique_scope_name = tf.get_default_graph().unique_name(
                name or default_name)
            with tf.variable_scope(unique_scope_name) as fresh_scope:
                self._variable_scope = fresh_scope

        # Tensor conversion; the creation order of these ops is significant
        # for auto-generated op names and is kept as-is.
        self._preconditioner_decay_rate = tf.convert_to_tensor(
            preconditioner_decay_rate, name='preconditioner_decay_rate')
        self._data_size = tf.convert_to_tensor(data_size, name='data_size')
        self._burnin = tf.convert_to_tensor(burnin, name='burnin')
        self._diagonal_bias = tf.convert_to_tensor(
            diagonal_bias, name='diagonal_bias')
        self._learning_rate = tf.convert_to_tensor(
            learning_rate, name='learning_rate')
        self._parallel_iterations = parallel_iterations

        with tf.variable_scope(self._variable_scope):
            self._counter = tf.get_variable(
                'counter', initializer=0, trainable=False)

        # Attach the validity checks to each hyperparameter tensor.
        decay_checks = [
            tf.assert_non_negative(
                self._preconditioner_decay_rate,
                message='`preconditioner_decay_rate` must be non-negative'),
            tf.assert_less_equal(
                self._preconditioner_decay_rate,
                1.,
                message='`preconditioner_decay_rate` must be at most 1.'),
        ]
        self._preconditioner_decay_rate = control_flow_ops.with_dependencies(
            decay_checks, self._preconditioner_decay_rate)

        data_size_checks = [
            tf.assert_greater(
                self._data_size,
                0,
                message='`data_size` must be greater than zero'),
        ]
        self._data_size = control_flow_ops.with_dependencies(
            data_size_checks, self._data_size)

        burnin_checks = [
            tf.assert_non_negative(
                self._burnin, message='`burnin` must be non-negative'),
            tf.assert_integer(
                self._burnin, message='`burnin` must be an integer'),
        ]
        self._burnin = control_flow_ops.with_dependencies(
            burnin_checks, self._burnin)

        bias_checks = [
            tf.assert_non_negative(
                self._diagonal_bias,
                message='`diagonal_bias` must be non-negative'),
        ]
        self._diagonal_bias = control_flow_ops.with_dependencies(
            bias_checks, self._diagonal_bias)

        super(StochasticGradientLangevinDynamics, self).__init__(
            use_locking=False, name=name or default_name)
def percentile(x,
               q,
               axis=None,
               interpolation=None,
               keep_dims=False,
               validate_args=False,
               name=None):
    """Compute the `q`-th percentile of `x`.

    Given a vector `x`, the `q`-th percentile of `x` is the value `q / 100` of
    the way from the minimum to the maximum in a sorted copy of `x`.

    The values and distances of the two nearest neighbors as well as the
    `interpolation` parameter will determine the percentile if the normalized
    ranking does not match the location of `q` exactly.

    This function is the same as the median if `q = 50`, the same as the
    minimum if `q = 0` and the same as the maximum if `q = 100`.

    ```python
    # Get 30th percentile with default ('nearest') interpolation.
    x = [1., 2., 3., 4.]
    percentile(x, q=30.)
    ==> 2.0

    # Get 30th percentile with 'lower' interpolation
    x = [1., 2., 3., 4.]
    percentile(x, q=30., interpolation='lower')
    ==> 1.0

    # Get 100th percentile (maximum). By default, this is computed over every
    # dim
    x = [[1., 2.],
         [3., 4.]]
    percentile(x, q=100.)
    ==> 4.0

    # Treat the leading dim as indexing samples, and find the 100th quantile
    # (max) over all such samples.
    x = [[1., 2.],
         [3., 4.]]
    percentile(x, q=100., axis=[0])
    ==> [3., 4.]
    ```

    Compare to `numpy.percentile`.

    Args:
      x: Floating point `N-D` `Tensor` with `N > 0`. If `axis` is not `None`,
        `x` must have statically known number of dimensions.
      q: Scalar `Tensor` in `[0, 100]`. The percentile.
      axis: Optional `0-D` or `1-D` integer `Tensor` with constant values.
        The axis that hold independent samples over which to return the
        desired percentile. If `None` (the default), treat every dimension as
        a sample dimension, returning a scalar.
      interpolation : {"lower", "higher", "nearest"}. Default: "nearest"
        This optional parameter specifies the interpolation method to use when
        the desired quantile lies between two data points `i < j`:
          * lower: `i`.
          * higher: `j`.
          * nearest: `i` or `j`, whichever is nearest.
      keep_dims: Python `bool`. If `True`, the sample dimensions are kept in
        the result with size 1 (with `axis=None` the result has the rank of
        `x` and every dimension has size 1). If `False`, they are removed from
        the output shape.
      validate_args: Whether to add runtime checks of argument validity.
        If False, and arguments are incorrect, correct behavior is not
        guaranteed.
      name: A Python string name to give this `Op`. Default is "percentile"

    Returns:
      A `(N - len(axis))` dimensional `Tensor` of same dtype as `x`, or, if
        `axis` is `None`, a scalar.

    Raises:
      ValueError: If argument 'interpolation' is not an allowed type.
      ValueError: If `axis` is given but its value is not statically
        available.
    """
    name = name or "percentile"
    allowed_interpolations = {"lower", "higher", "nearest"}

    if interpolation is None:
        interpolation = "nearest"
    elif interpolation not in allowed_interpolations:
        raise ValueError(
            "Argument 'interpolation' must be in %s. Found %s" %
            (allowed_interpolations, interpolation))

    # The input tensors go in `values`; the second positional parameter of
    # tf.name_scope is `default_name`, not `values`.
    with tf.name_scope(name, values=[x, q]):
        x = tf.convert_to_tensor(x, name="x")
        # Double is needed here and below, else we get the wrong index if the
        # array is huge along axis.
        q = tf.to_double(q, name="q")
        _get_static_ndims(q, expect_ndims=0)

        if validate_args:
            q = control_flow_ops.with_dependencies([
                tf.assert_rank(q, 0),
                tf.assert_greater_equal(q, tf.to_double(0.)),
                tf.assert_less_equal(q, tf.to_double(100.))
            ], q)

        if axis is None:
            # Every element is a sample: flatten to a single sample dimension.
            y = tf.reshape(x, [-1])
        else:
            axis = tf.convert_to_tensor(axis, name="axis")
            # Static dtype check; the returned op is intentionally unused.
            tf.assert_integer(axis)
            axis_ndims = _get_static_ndims(
                axis, expect_static=True, expect_ndims_no_more_than=1)
            axis_const = tensor_util.constant_value(axis)
            if axis_const is None:
                raise ValueError(
                    "Expected argument 'axis' to be statically available.  "
                    "Found: %s" % axis)
            axis = axis_const
            if axis_ndims == 0:
                axis = [axis]
            axis = [int(a) for a in axis]
            x_ndims = _get_static_ndims(
                x, expect_static=True, expect_ndims_at_least=1)
            axis = _make_static_axis_non_negative(axis, x_ndims)
            y = _move_dims_to_flat_end(x, axis, x_ndims)

        frac_at_q_or_above = 1. - q / 100.
        d = tf.to_double(tf.shape(y)[-1])

        # `interpolation` was validated above, so exactly one branch matches.
        if interpolation == "lower":
            index = tf.ceil((d - 1) * frac_at_q_or_above)
        elif interpolation == "higher":
            index = tf.floor((d - 1) * frac_at_q_or_above)
        elif interpolation == "nearest":
            index = tf.round((d - 1) * frac_at_q_or_above)

        # If d is gigantic, then we would have d == d - 1, even in double...
        # So let's use max/min to avoid out of bounds errors.
        d = tf.shape(y)[-1]
        # d - 1 will be distinct from d in int32.
        index = tf.clip_by_value(tf.to_int32(index), 0, d - 1)

        # Sort everything, not just the top 'k' entries, which allows multiple
        # calls to sort only once (under the hood) and use CSE.
        sorted_y = _sort_tensor(y)

        # result.shape = B
        result = sorted_y[..., index]
        result.set_shape(y.get_shape()[:-1])

        if keep_dims:
            if axis is None:
                # ones_vec = [1, 1,..., 1], total length = len(S) + len(B).
                ones_vec = tf.ones(
                    shape=[_get_best_effort_ndims(x)], dtype=tf.int32)
                result *= tf.ones(ones_vec, dtype=x.dtype)
            else:
                result = _insert_back_keep_dims(result, axis)

        return result
def embed(input_ids,
          vocab_size,
          embedding_size,
          position_offset=0,
          initializer_range=0.02,
          max_position_embeddings=512,
          use_one_hot_embeddings=True):
    """Embed token ids, add learned position embeddings and layer-normalize.

    :param input_ids: int Tensor of shape [batch_size, seq_length].
    :param vocab_size: number of words in vocab
    :param embedding_size: dimensionality of the embedding
    :param position_offset: aka number of cached tokens. The positions used
        are `position_offset, ..., position_offset + seq_length - 1`.
    :param initializer_range: float. Range of the weight initialization.
    :param max_position_embeddings: int. Maximum sequence length.
    :param use_one_hot_embeddings: if True, word embeddings are looked up with
        a one-hot matmul; otherwise with `tf.nn.embedding_lookup`.
    :return: tuple `(embedded, embedding_table)`. `embedded` is the result of
        `layer_norm` applied to the
        [batch_size, seq_length, embedding_size] sum of word and position
        embeddings; `embedding_table` is the
        [vocab_size, embedding_size] word-embedding variable.
    """
    (batch_size, seq_length) = get_shape_list(input_ids, expected_rank=2)
    embedding_table = tf.get_variable(
        name='word_embed',
        shape=[vocab_size, embedding_size],
        initializer=create_initializer(initializer_range),
    )

    # Every token id must index a row of the embedding table.
    assert_op = tf.assert_less_equal(tf.reduce_max(input_ids), vocab_size - 1)
    with tf.control_dependencies([assert_op]):
        if use_one_hot_embeddings:
            flat_input_ids = tf.reshape(input_ids, [-1])
            one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
            output_flat = tf.matmul(one_hot_input_ids, embedding_table)
        else:
            output_flat = tf.nn.embedding_lookup(embedding_table, input_ids)

        embedded_input = tf.reshape(
            output_flat, [batch_size, seq_length, embedding_size])

    # The largest position index used is position_offset + seq_length - 1, so
    # the offset has to be part of the bound. Without it, positions past the
    # table would become all-zero one-hot rows and silently contribute a zero
    # position embedding.
    assert_op = tf.assert_less_equal(seq_length + position_offset,
                                     max_position_embeddings)

    with tf.control_dependencies([assert_op]):
        full_position_embeddings = tf.get_variable(
            name='pos_embed',
            shape=[max_position_embeddings, embedding_size],
            initializer=create_initializer(initializer_range),
        )
        # Since the position embedding table is a learned variable, we create
        # it using a (long) sequence length `max_position_embeddings`. The
        # actual sequence length might be shorter than this, for faster
        # training of tasks that do not have long sequences.
        #
        # So `full_position_embeddings` is effectively an embedding table
        # for position [0, 1, 2, ..., max_position_embeddings-1], and the
        # current sequence has positions [0, 1, 2, ... seq_length-1], so we
        # can just perform a slice.
        if position_offset == 0:
            embedded_input += tf.slice(
                full_position_embeddings, [0, 0], [seq_length, -1])[None]
        else:
            # With an offset, the rows are selected through a one-hot matmul
            # instead of a slice.
            flat_pos_ids = (
                tf.range(seq_length, dtype=tf.int32) + position_offset)
            one_hot_pos_ids = tf.one_hot(
                flat_pos_ids, depth=max_position_embeddings)

            # [seq_length, max_position_embeddings] x
            # [max_position_embeddings, embedding_size]
            seq_embeds = tf.matmul(one_hot_pos_ids, full_position_embeddings)
            embedded_input += seq_embeds[None]

    return layer_norm(embedded_input, name='embed_norm'), embedding_table
def __call__(self, inputs, state, scope=None):
    """Run the wrapped cell once and keep the best `beam_size` continuations.

    Args:
      inputs: Input passed straight to `self.cell`.
      state: 5-tuple `(past_cand_symbols, past_cand_logprobs,
        past_beam_symbols, past_beam_logprobs, past_cell_state)`; see the
        shape comments on the unpacking below. `past_beam_symbols` may be
        `None`, in which case there is no history to gather.
      scope: Not used by this method.

    Returns:
      A pair `(outputs, new_state)`. `outputs` holds the chosen symbols
      flattened to one dimension; `new_state` is the 5-tuple
      `(cand_symbols, cand_logprobs, beam_symbols, beam_logprobs, cell_state)`.
    """
    (
        past_cand_symbols,   # [batch_size, max_len]
        past_cand_logprobs,  # [batch_size]
        past_beam_symbols,   # [batch_size*self.beam_size, max_len], right-aligned!!!
        past_beam_logprobs,  # [batch_size*self.beam_size]
        past_cell_state,
    ) = state
    batch_size = tf.shape(past_cand_logprobs)[0]  # TODO: get as int, if possible
    full_size = batch_size * self.beam_size

    cell_inputs = inputs
    cell_outputs, raw_cell_state = self.cell(cell_inputs, past_cell_state)

    # Score of every (beam, class) continuation: the beam's accumulated
    # log-probability plus the log-probability of the next symbol.
    logprobs = tf.nn.log_softmax(cell_outputs)
    logprobs_batched = tf.reshape(
        logprobs + tf.expand_dims(past_beam_logprobs, 1),
        [-1, self.beam_size * self.num_classes])
    logprobs_batched.set_shape((None, self.beam_size * self.num_classes))

    # In graph mode an assert op only runs if something depends on it, so the
    # checks are attached to the tensor that feeds the top-k selection.
    sanity_checks = [
        tf.assert_less_equal(logprobs, 0.0),
        tf.assert_less_equal(past_beam_logprobs, 0.0),
    ]
    with tf.control_dependencies(sanity_checks):
        masked_logprobs = tf.reshape(
            logprobs_batched, [-1, self.beam_size * self.num_classes])

    beam_logprobs, indices = tf.nn.top_k(masked_logprobs, self.beam_size)
    beam_logprobs = tf.reshape(beam_logprobs, [-1])

    # For continuing to the next symbols
    symbols = indices % self.num_classes  # [batch_size, self.beam_size]
    # [batch_size*self.beam_size]
    parent_refs = tf.reshape(indices // self.num_classes, [-1])

    # TODO: this technically doesn't need to be recalculated every loop
    # NOTE(review): `tf.mul` and `tf.concat(axis, values)` are pre-1.0
    # TensorFlow spellings; they are kept as-is on the assumption that the
    # file targets that API. Confirm before modernizing.
    parent_refs_offsets = tf.mul(
        tf.floordiv(tf.range(full_size), self.beam_size), self.beam_size)
    parent_refs = parent_refs + parent_refs_offsets

    if past_beam_symbols is not None:
        # The new symbol is prepended to the gathered history, so each row of
        # `beam_symbols` is ordered newest-first.
        symbols_history = tf.gather(past_beam_symbols, parent_refs)
        beam_symbols = tf.concat(
            1, [tf.reshape(symbols, [-1, 1]), symbols_history])
    else:
        beam_symbols = tf.reshape(symbols, [-1, 1])

    # Handle the output and the cell state shuffling
    outputs = tf.reshape(symbols, [-1])  # [batch_size*beam_size]
    cell_state = nest_map(
        lambda element: tf.gather(element, parent_refs),
        raw_cell_state
    )

    # Done-token handling is not implemented here: the candidate symbols and
    # log-probabilities are passed through unchanged.
    cand_symbols = past_cand_symbols
    cand_logprobs = past_cand_logprobs

    return outputs, (
        cand_symbols,
        cand_logprobs,
        beam_symbols,
        beam_logprobs,
        cell_state,
    )
def model_fn(features, labels, mode, params):
    """Build the network and return the `EstimatorSpec` for `mode`.

    Reads `features['image']` and, outside PREDICT mode,
    `features['nan_mask']`, which is used as the weights of the MSE loss and
    of the RMSE metrics. `params['model']` configures the layers and
    `params['training']` configures checkpoint restoring and the learning
    rate.

    Returns:
      PREDICT: a spec with predictions `{'spikes': logits}`.
      TRAIN: a spec with the loss and a train op grouped with the ops in the
        `UPDATE_OPS` collection.
      otherwise: a spec with the loss and the overall and per-column RMSE
        metrics.
    """
    image = features['image']
    model_params = params['model']
    num_classes = model_params['num_classes']
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # Convolutional stack.
    conv = build_conv_layers(image, model_params['conv_layers'], is_training)

    # Optionally initialise the variables created so far from a checkpoint
    # and remember their names so they can be left out of training.
    training_params = params['training']
    frozen_names = set()
    checkpoint_path = training_params.get('checkpoint_path')
    freeze_restored_variables = training_params.get(
        'freeze_restored_variables', False)
    if checkpoint_path:
        restored_vars = tf.trainable_variables()
        # var.name[:-2] drops the last two characters of the variable name.
        assignment_map = {var.name[:-2]: var for var in restored_vars}
        if freeze_restored_variables:
            frozen_names.update(var.name for var in restored_vars)
        tf.train.init_from_checkpoint(root_dir(checkpoint_path), assignment_map)

    # Dense stack.
    dense = build_dense_layers(conv, model_params['dense_layers'], is_training)

    # Logits.
    # NOTE(review): the branch tests the top-level `params` for 'subnet' but
    # reads the settings from params['model']['subnet'] — confirm this is
    # intended.
    if 'subnet' in params:
        # One small network with a single output unit per class.
        subnet_params = model_params['subnet']
        subnet_dropout_rate = subnet_params.get('subnet_dropout_rate', 0)
        if subnet_dropout_rate:
            dense = tf.layers.dropout(
                inputs=dense, rate=subnet_dropout_rate, training=is_training)

        unit_layer_params = dict(model_params['logits_layer'], num_units=1)
        per_class_logits = []
        for _ in range(num_classes):
            subnet_hidden = build_dense_layers(
                dense, subnet_params['dense_layers'], is_training)
            per_class_logits.append(
                build_dense_layers(subnet_hidden, [unit_layer_params],
                                   is_training))
        logits = tf.concat(per_class_logits, axis=-1)
    else:
        # A single layer with one unit per class.
        logits_layer_params = dict(
            model_params['logits_layer'], num_units=num_classes)
        logits = build_dense_layers(dense, [logits_layer_params], is_training)

    # Prediction needs nothing beyond the logits.
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode, predictions={'spikes': logits})

    # Image summary for TensorBoard, guarded by a [0, 1] value-range check.
    max_images = 12
    images = image[:max_images]
    range_checks = [
        tf.assert_greater_equal(
            tf.reduce_min(images), 0.0,
            message='Image contains values less than 0'),
        tf.assert_less_equal(
            tf.reduce_max(images), 1.0,
            message='Image contains values greater than 1'),
    ]
    with tf.control_dependencies(range_checks):
        tf.summary.image(
            'images',
            tf.cast(images * 255, dtype=tf.uint8),
            max_outputs=max_images)

    # Loss: weighted MSE plus the regularization loss.
    nan_mask = tf.cast(features['nan_mask'], tf.float32)
    mse_loss = tf.losses.mean_squared_error(
        labels=labels, predictions=logits, weights=nan_mask)
    loss = mse_loss + tf.losses.get_regularization_loss()

    # Only variables that were not frozen above are trained.
    train_vars = [
        var for var in tf.trainable_variables()
        if var.name not in frozen_names
    ]

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimize_op = tf.contrib.layers.optimize_loss(
            loss=loss,
            global_step=tf.train.get_global_step(),
            learning_rate=training_params['learning_rate'],
            optimizer='Adam',
            summaries=['learning_rate', 'loss', 'gradients', 'gradient_norm'],
            variables=train_vars,
        )
        # Ops registered under UPDATE_OPS are run together with the optimizer
        # step.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group([optimize_op, update_ops])
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op)

    # Evaluation: overall RMSE followed by one RMSE per output column.
    eval_metric_ops = {
        'rmse': tf.metrics.root_mean_squared_error(
            labels=labels, predictions=logits, weights=nan_mask),
    }
    for column in range(num_classes):
        eval_metric_ops['rmse/column%d' % column] = (
            tf.metrics.root_mean_squared_error(
                labels=labels[:, column],
                predictions=logits[:, column],
                weights=nan_mask[:, column]))

    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
def test_doesnt_raise_when_equal(self):
    """`assert_less_equal(t, t)` must let a dependent op evaluate."""
    with self.test_session():
        small = tf.constant([1, 2], name="small")
        # A tensor compared with itself satisfies `<=` element-wise.
        equal_check = tf.assert_less_equal(small, small)
        with tf.control_dependencies([equal_check]):
            out = tf.identity(small)
        # Evaluating `out` runs the assertion first.
        out.eval()