def quantize_image(image): """Changes the range of pixel values to [0, 255] and cast it into uint8""" image = np.reshape(image, (image.shape[1], image.shape[2], 3)) image = tf.convert_to_tensor(image) image = tf.round(image * 255) image = tf.saturate_cast(image, tf.uint8) return image
def quantize_image(image): """ Taken from Balle's implementation """ image = tf.round(image * 255) image = tf.saturate_cast(image, tf.uint8) return image
def summary(images, name):
    """As a hack, saves image summaries by adding to `eval_metric_ops`."""
    images = tf.saturate_cast(images * 255 + 0.5, tf.uint8)
    eval_metric_ops[name] = (tf.summary.image(name, images, max_outputs=2),
                             tf.no_op())
def _legacy_output_transform_func(*expr, out_mul=1.0, out_add=0.0, out_shrink=1, out_dtype=None):
    if out_mul != 1.0:
        expr = [x * out_mul for x in expr]
    if out_add != 0.0:
        expr = [x + out_add for x in expr]
    if out_shrink > 1:
        ksize = [1, 1, out_shrink, out_shrink]
        expr = [
            tf.nn.avg_pool(x, ksize=ksize, strides=ksize, padding="VALID", data_format="NCHW")
            for x in expr
        ]
    if out_dtype is not None:
        if tf.as_dtype(out_dtype).is_integer:
            expr = [tf.round(x) for x in expr]
        expr = [tf.saturate_cast(x, out_dtype) for x in expr]
    return expr
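# Hypothetical usage sketch (values are illustrative): mapping generator
# outputs from [-1, 1] to uint8 in [0, 255]. The integer branch rounds before
# the saturating cast; out_shrink > 1 would additionally average-pool in NCHW
# layout, which typically requires a GPU kernel.
import tensorflow as tf

images = tf.random.normal([2, 3, 64, 64])   # NCHW float32, roughly [-1, 1]
(uint8_images,) = _legacy_output_transform_func(
    images, out_mul=127.5, out_add=127.5, out_dtype=tf.uint8)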
def _compute_compression_graph(self, input_image, create_summaries=True):
    """Computes a forward pass through the encoder and decoder.

    Args:
      input_image: Input image, range [0, 255].
      create_summaries: Whether to create summaries.

    Returns:
      A (Nodes, BppPair, bitstream_tensors) tuple.
    """
    with tf.name_scope("image_shape"):
        image_shape = tf.shape(input_image)[1:-1]  # Get H, W.

    if self.evaluation:
        num_downscaling = self._encoder.num_downsampling_layers
        factor = 2 ** num_downscaling
        tf.logging.info("Padding to {}".format(factor))
        input_image = _pad(input_image, image_shape, factor)

    with tf.name_scope("scale_down"):
        input_image_scaled = tf.cast(input_image, tf.float32) / 255.

    info = self._get_encoder_out(input_image_scaled, image_shape)
    decoder_in = info.decoded
    total_nbpp = info.total_nbpp
    total_qbpp = info.total_qbpp
    bitstream_tensors = info.bitstream_tensors

    reconstruction, reconstruction_scaled = self._compute_reconstruction(
        decoder_in, image_shape, input_image_scaled.shape)

    if create_summaries and self._create_image_summaries:
        tf.summary.image("input_image",
                         tf.saturate_cast(input_image, tf.uint8),
                         max_outputs=1)
        tf.summary.image("reconstruction",
                         tf.saturate_cast(reconstruction, tf.uint8),
                         max_outputs=1)

    nodes = Nodes(input_image, input_image_scaled,
                  reconstruction, reconstruction_scaled,
                  latent_quantized=decoder_in)
    return nodes, BppPair(total_nbpp, total_qbpp), bitstream_tensors
def write_png(filename, image):
    """Creates graph to write a PNG image file."""
    image = tf.squeeze(image, 0)
    if image.dtype.is_floating:
        image = tf.round(image)
    if image.dtype != tf.uint8:
        image = tf.saturate_cast(image, tf.uint8)
    string = tf.image.encode_png(image)
    return tf.io.write_file(filename, string)
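# Usage sketch (path and shapes are illustrative): `write_png` expects a
# batched image of shape [1, H, W, C]; floats are rounded and saturate-cast
# to uint8 before PNG encoding.
import tensorflow as tf

image = tf.random.uniform([1, 32, 32, 3], maxval=255.0)   # float32 in [0, 255)
write_op = write_png("/tmp/example.png", image)            # executes eagerly in TF 2.x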
def __init__(self,
             config,
             is_training,
             input_ids,
             attention_mask=None,
             token_type_ids=None,
             return_pool=True,
             scope=None,
             reuse=False,
             compute_type=tf.float32):
    super().__init__(config, is_training)

    input_shape = model_utils.get_shape_list(input_ids)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if attention_mask is None:
        attention_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
    if token_type_ids is None:
        token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(
            scope,
            default_name="bert",
            reuse=tf.AUTO_REUSE if reuse else None,
            custom_getter=model_utils.get_custom_getter(compute_type)):
        with tf.variable_scope("embeddings"):
            self.embedding_output, self.embedding_table = albert_embedding(
                config=self.config,
                input_ids=input_ids,
                token_type_ids=token_type_ids,
            )
        with tf.variable_scope("encoder"):
            attention_mask = model_utils.create_bert_mask(input_ids, attention_mask)
            encoder_outputs = albert_encoder(
                config=self.config,
                input_tensor=tf.saturate_cast(self.embedding_output, compute_type),
                attention_mask=attention_mask)
        if return_pool:
            with tf.variable_scope("pooler"):
                pooled_output = layers.pooler_layer(
                    sequence_output=encoder_outputs[0],
                    hidden_size=self.config.hidden_size,
                    initializer_range=self.config.initializer_range)
        else:
            pooled_output = None
    # (pooled output, sequence output, all layer outputs, all layer att probs)
    self.outputs = (pooled_output, ) + encoder_outputs
def convert_images_to_uint8(images, drange=[-1, 1], nchw_to_nhwc=False, shrink=1):
    """Converts a minibatch of images from float32 to uint8 with configurable dynamic range.

    Can be used as an output transformation for Network.run().
    """
    images = tf.cast(images, tf.float32)
    if shrink > 1:
        ksize = [1, 1, shrink, shrink]
        images = tf.nn.avg_pool(images, ksize=ksize, strides=ksize,
                                padding="VALID", data_format="NCHW")
    if nchw_to_nhwc:
        images = tf.transpose(images, [0, 2, 3, 1])
    scale = 255 / (drange[1] - drange[0])
    images = images * scale + (0.5 - drange[0] * scale)
    return tf.saturate_cast(images, tf.uint8)
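# Usage sketch (shapes are illustrative): converting a minibatch of NCHW
# generator outputs in [-1, 1] into NHWC uint8 images, e.g. before saving
# them or feeding them to tf.summary.image.
import tensorflow as tf

fakes = tf.random.normal([4, 3, 128, 128])   # NCHW float32
uint8_images = convert_images_to_uint8(fakes, drange=[-1, 1], nchw_to_nhwc=True)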
def run(
        self,
        *in_arrays,
        return_as_list=False,  # True = return a list of NumPy arrays, False = return a single NumPy array, or a tuple if there are multiple outputs.
        print_progress=False,  # Print progress to the console? Useful for very large input arrays.
        minibatch_size=None,   # Maximum minibatch size to use, None = disable batching.
        num_gpus=1,            # Number of GPUs to use.
        out_mul=1.0,           # Multiplicative constant to apply to the output(s).
        out_add=0.0,           # Additive constant to apply to the output(s).
        out_shrink=1,          # Shrink the spatial dimensions of the output(s) by the given factor.
        out_dtype=None,        # Convert the output to the specified data type.
        **dynamic_kwargs):     # Additional keyword arguments to pass into the network construction function.
    assert len(in_arrays) == self.num_inputs
    num_items = in_arrays[0].shape[0]
    if minibatch_size is None:
        minibatch_size = num_items
    key = str([
        list(sorted(dynamic_kwargs.items())), num_gpus, out_mul, out_add,
        out_shrink, out_dtype
    ])

    # Build graph.
    if key not in self._run_cache:
        with absolute_name_scope(self.scope + '/Run'), tf.control_dependencies(None):
            in_split = list(
                zip(*[tf.split(x, num_gpus) for x in self.input_templates]))
            out_split = []
            for gpu in range(num_gpus):
                with tf.device('/gpu:%d' % gpu):
                    out_expr = self.get_output_for(*in_split[gpu],
                                                   return_as_list=True,
                                                   **dynamic_kwargs)
                    if out_mul != 1.0:
                        out_expr = [x * out_mul for x in out_expr]
                    if out_add != 0.0:
                        out_expr = [x + out_add for x in out_expr]
                    if out_shrink > 1:
                        ksize = [1, 1, out_shrink, out_shrink]
                        out_expr = [
                            tf.nn.avg_pool(x,
                                           ksize=ksize,
                                           strides=ksize,
                                           padding='VALID',
                                           data_format='NCHW')
                            for x in out_expr
                        ]
                    if out_dtype is not None:
                        if tf.as_dtype(out_dtype).is_integer:
                            out_expr = [tf.round(x) for x in out_expr]
                        out_expr = [
                            tf.saturate_cast(x, out_dtype) for x in out_expr
                        ]
                    out_split.append(out_expr)
            self._run_cache[key] = [
                tf.concat(outputs, axis=0) for outputs in zip(*out_split)
            ]

    # Run minibatches.
    out_expr = self._run_cache[key]
    out_arrays = [
        np.empty([num_items] + shape_to_list(expr.shape)[1:], expr.dtype.name)
        for expr in out_expr
    ]
    for mb_begin in range(0, num_items, minibatch_size):
        if print_progress:
            print('\r%d / %d' % (mb_begin, num_items), end='')
        mb_end = min(mb_begin + minibatch_size, num_items)
        mb_in = [src[mb_begin:mb_end] for src in in_arrays]
        # Run the cached graph in the session that owns the network's variables.
        mb_out = tf.get_default_session().run(
            out_expr, dict(zip(self.input_templates, mb_in)))
        for dst, src in zip(out_arrays, mb_out):
            dst[mb_begin:mb_end] = src

    # Done.
    if print_progress:
        print('\r%d / %d' % (num_items, num_items))
    if not return_as_list:
        out_arrays = out_arrays[0] if len(out_arrays) == 1 else tuple(out_arrays)
    return out_arrays
def __init__(self,
             config,
             is_training,
             input_ids,
             attention_mask=None,
             token_type_ids=None,
             return_pool=True,
             scope=None,
             reuse=False,
             compute_type=tf.float32):
    super().__init__(config, is_training)

    input_shape = model_utils.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if attention_mask is None:
        attention_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
    if token_type_ids is None:
        token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(
            scope,
            default_name="electra",
            reuse=tf.AUTO_REUSE if reuse else None,
            custom_getter=model_utils.get_custom_getter(compute_type)):
        with tf.variable_scope("embeddings"):
            self.embedding_output, self.embedding_table = bert_embedding(
                config=self.config,
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                add_position_embedding=True)
        if model_utils.get_shape_list(
                self.embedding_output)[-1] != self.config.hidden_size:
            self.embedding_output = layers.dense(
                self.embedding_output,
                self.config.hidden_size,
                'embeddings_project',
                initializer_range=self.config.initializer_range)
        with tf.variable_scope("encoder"):
            attention_mask = model_utils.create_bert_mask(input_ids, attention_mask)
            encoder_outputs = bert_encoder(
                config=self.config,
                input_tensor=tf.saturate_cast(self.embedding_output, compute_type),
                attention_mask=attention_mask,
                use_relative_position=False)
        # ELECTRA's pooled output is simply the first token's vector.
        if return_pool:
            pooled_output = encoder_outputs[0][:, 0]
        else:
            pooled_output = None
    # (pooled output, sequence output, all layer outputs, all layer att probs)
    self.outputs = (pooled_output, ) + encoder_outputs
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None,
                    use_relative_position=False,  # relative positions, as used by NEZHA
                    compute_type=tf.float32,      # compute in float16 or float32
                    do_return_attentions_probs=False):

    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        # Reshape and transpose the input to [batch, num_heads, seq_len, width].
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    # The two ranks must match.
    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    # Flatten to 2-D: [batch * seq_len, width].
    from_tensor_2d = reshape_to_matrix(from_tensor)
    to_tensor_2d = reshape_to_matrix(to_tensor)

    # `query_layer` = [B*F, N*H]
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        kernel_initializer=create_initializer(initializer_range))

    # `key_layer` = [B*T, N*H]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        kernel_initializer=create_initializer(initializer_range))

    # `value_layer` = [B*T, N*H]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        kernel_initializer=create_initializer(initializer_range))

    # `query_layer` = [B, N, F, H]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)

    # `key_layer` = [B, N, T, H]
    key_layer = transpose_for_scores(key_layer, batch_size,
                                     num_attention_heads, to_seq_length,
                                     size_per_head)

    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    # `attention_scores` = [B, N, F, T]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)

    if use_relative_position:
        assert from_seq_length == to_seq_length
        max_relative_position = 64
        # `relations_keys` = [F|T, F|T, H]
        relations_keys = _generate_relative_positions_embeddings(
            to_seq_length, size_per_head, max_relative_position, cache=False)
        relations_keys = tf.saturate_cast(relations_keys, compute_type)
        # query_layer_t is [F, B, N, H]
        query_layer_t = tf.transpose(query_layer, [2, 0, 1, 3])
        # query_layer_r is [F, B * N, H]
        query_layer_r = tf.reshape(
            query_layer_t,
            [from_seq_length, batch_size * num_attention_heads, size_per_head])
        # key_position_scores is [F, B * N, F|T]
        key_position_scores = tf.matmul(query_layer_r, relations_keys, transpose_b=True)
        # key_position_scores_r is [F, B, N, F|T]
        key_position_scores_r = tf.reshape(
            key_position_scores,
            [from_seq_length, batch_size, num_attention_heads, from_seq_length])
        # key_position_scores_r_t is [B, N, F, F|T]
        key_position_scores_r_t = tf.transpose(key_position_scores_r, [1, 2, 0, 3])
        attention_scores = attention_scores + key_position_scores_r_t

    # Scale the scores by 1 / sqrt(size_per_head).
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]; expand dims so it broadcasts over heads.
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - tf.cast(attention_mask, attention_scores.dtype)) * -10000.0
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]; masked positions end up near zero.
    attention_probs = tf.nn.softmax(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

    # `value_layer` = [B, T, N, H]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_seq_length, num_attention_heads, size_per_head])

    # `value_layer` = [B, N, T, H]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

    # `context_layer` = [B, N, F, H]
    # Multiply the attention probabilities by the values:
    # [B, N, F, T] * [B, N, T, H] = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)

    if use_relative_position:
        # `relations_values` = [F|T, F|T, H]
        relations_values = _generate_relative_positions_embeddings(
            to_seq_length, size_per_head, max_relative_position, cache=False)
        relations_values = tf.saturate_cast(relations_values, compute_type)
        # attention_probs_t is [F, B, N, T]
        attention_probs_t = tf.transpose(attention_probs, [2, 0, 1, 3])
        # attention_probs_r is [F, B * N, T]
        attention_probs_r = tf.reshape(
            attention_probs_t,
            [from_seq_length, batch_size * num_attention_heads, to_seq_length])
        # value_position_scores is [F, B * N, H]
        value_position_scores = tf.matmul(attention_probs_r, relations_values,
                                          transpose_b=False)
        # value_position_scores_r is [F, B, N, H]
        value_position_scores_r = tf.reshape(
            value_position_scores,
            [from_seq_length, batch_size, num_attention_heads, size_per_head])
        # value_position_scores_r_t is [B, N, F, H]
        value_position_scores_r_t = tf.transpose(value_position_scores_r, [1, 2, 0, 3])
        context_layer = context_layer + value_position_scores_r_t

    # `context_layer` = [B, F, N, H]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size * from_seq_length, num_attention_heads * size_per_head])
    else:
        # `context_layer` = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length, num_attention_heads * size_per_head])

    if do_return_attentions_probs:
        outputs = (context_layer, attention_probs)
    else:
        outputs = (context_layer, )
    return outputs
def __init__(self,
             bert_config,
             is_training,
             input_ids,
             input_mask=None,
             token_type_ids=None,
             use_one_hot_embeddings=True,
             scope=None,
             embedding_size=None,
             input_embeddings=None,
             input_reprs=None,
             update_embeddings=True,
             untied_embeddings=False,
             use_fp16=False):
    """Constructor for BertModel.

    Args:
      bert_config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
        it is much faster if this is True, on the CPU or GPU, it is faster if
        this is False.
      scope: (optional) variable scope. Defaults to "electra".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    bert_config = copy.deepcopy(bert_config)
    if not is_training:
        bert_config.hidden_dropout_prob = 0.0
        bert_config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(token_type_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)
    assert token_type_ids is not None

    embedding_scope = (scope if untied_embeddings else "electra") + "/embeddings"
    if input_reprs is None:
        if input_embeddings is None:
            with tf.variable_scope(embedding_scope, reuse=tf.AUTO_REUSE):
                # Perform embedding lookup on the word ids.
                if embedding_size is None:
                    embedding_size = bert_config.hidden_size
                self.token_embeddings, self.embedding_table = embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=bert_config.vocab_size,
                    embedding_size=embedding_size,
                    initializer_range=bert_config.initializer_range,
                    word_embedding_name="word_embeddings",
                    use_one_hot_embeddings=use_one_hot_embeddings)
        else:
            self.token_embeddings = input_embeddings

        with tf.variable_scope(embedding_scope, reuse=tf.AUTO_REUSE):
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = embedding_postprocessor(
                input_tensor=self.token_embeddings,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=bert_config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=bert_config.initializer_range,
                max_position_embeddings=bert_config.max_position_embeddings,
                dropout_prob=bert_config.hidden_dropout_prob)
    else:
        self.embedding_output = input_reprs
    if not update_embeddings:
        self.embedding_output = tf.stop_gradient(self.embedding_output)

    with tf.variable_scope(scope, default_name="electra"):
        if self.embedding_output.shape[-1] != bert_config.hidden_size:
            self.embedding_output = tf.layers.dense(
                self.embedding_output,
                bert_config.hidden_size,
                name="embeddings_project")

        with tf.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = create_attention_mask_from_input_mask(
                token_type_ids, input_mask, use_fp16)

            # Run the stacked transformer. Output shapes:
            #   sequence_output: [batch_size, seq_length, hidden_size]
            #   pooled_output: [batch_size, hidden_size]
            #   all_encoder_layers: [n_layers, batch_size, seq_length, hidden_size]
            #   attn_maps: [n_layers, batch_size, n_heads, seq_length, seq_length]
            self.all_layer_outputs, self.attn_maps = transformer_model(
                input_tensor=tf.saturate_cast(self.embedding_output,
                                              infer_dtype(use_fp16)),
                attention_mask=attention_mask,
                hidden_size=bert_config.hidden_size,
                num_hidden_layers=bert_config.num_hidden_layers,
                num_attention_heads=bert_config.num_attention_heads,
                intermediate_size=bert_config.intermediate_size,
                intermediate_act_fn=get_activation(bert_config.hidden_act),
                hidden_dropout_prob=bert_config.hidden_dropout_prob,
                attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
                initializer_range=bert_config.initializer_range,
                do_return_all_layers=True,
                use_fp16=use_fp16)
            self.sequence_output = self.all_layer_outputs[-1]
            self.pooled_output = self.sequence_output[:, 0]
def convert_float_to_uint8(image):
    image = tf.round(image * 255)
    image = tf.saturate_cast(image, tf.uint8)
    return image
def attention(input_tensor,
              attention_mask=None,
              hidden_size=768,
              num_attention_heads=12,
              attention_probs_dropout_prob=0.0,
              initializer_range=0.02,
              use_relative_position=False,  # relative positions, as used by NEZHA
              do_return_attentions_probs=False):
    """Self-attention layer.

    :param input_tensor: [batch_size, seq_len, hidden_size]
    :param attention_mask:
    :param hidden_size:
    :param num_attention_heads:
    :param attention_probs_dropout_prob:
    :param initializer_range:
    :param use_relative_position:
    :param do_return_attentions_probs:
    :return:
    """

    def transpose(tensor, shape):
        output_tensor = tf.reshape(tensor, shape=shape)
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    shape = model_utils.get_shape_list(input_tensor, expected_rank=3)
    size_per_head = hidden_size // num_attention_heads

    query_layer = dense(input_tensor=input_tensor,
                        output_size=num_attention_heads * size_per_head,
                        activation=None,
                        name='query',
                        initializer_range=initializer_range)
    key_layer = dense(input_tensor=input_tensor,
                      output_size=num_attention_heads * size_per_head,
                      activation=None,
                      name='key',
                      initializer_range=initializer_range)
    value_layer = dense(input_tensor=input_tensor,
                        output_size=num_attention_heads * size_per_head,
                        activation=None,
                        name='value',
                        initializer_range=initializer_range)

    query_layer = transpose(query_layer,
                            shape=shape[:-1] + [num_attention_heads, size_per_head])
    key_layer = transpose(key_layer,
                          shape=shape[:-1] + [num_attention_heads, size_per_head])

    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)

    if use_relative_position:
        # `relations_keys` = [F|T, F|T, H]
        relations_keys = model_utils._generate_relative_positions_embeddings(
            shape[1], size_per_head, 64, cache=False)
        relations_keys = tf.saturate_cast(relations_keys, tf.float32)
        # query_layer_t is [F, B, N, H]
        query_layer_t = tf.transpose(query_layer, [2, 0, 1, 3])
        # query_layer_r is [F, B * N, H]
        query_layer_r = tf.reshape(
            query_layer_t, [shape[1], shape[0] * num_attention_heads, size_per_head])
        # key_position_scores is [F, B * N, F|T]
        key_position_scores = tf.matmul(query_layer_r, relations_keys, transpose_b=True)
        # key_position_scores_r is [F, B, N, F|T]
        key_position_scores_r = tf.reshape(
            key_position_scores, [shape[1], shape[0], num_attention_heads, shape[1]])
        # key_position_scores_r_t is [B, N, F, F|T]
        key_position_scores_r_t = tf.transpose(key_position_scores_r, [1, 2, 0, 3])
        attention_scores = attention_scores + key_position_scores_r_t

    # Scale the scores by 1 / sqrt(size_per_head).
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]; expand dims so it broadcasts over heads.
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 - tf.cast(attention_mask, attention_scores.dtype)) * -10000.0
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder

    attention_probs = tf.nn.softmax(attention_scores)
    attention_probs = model_utils.dropout(attention_probs,
                                          attention_probs_dropout_prob)

    # `value_layer` = [B, N, T, H]
    value_layer = transpose(value_layer,
                            shape=shape[:-1] + [num_attention_heads, size_per_head])

    # `context_layer` = [B, N, F, H]
    # Multiply the attention probabilities by the values:
    # [B, N, F, T] * [B, N, T, H] = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)

    if use_relative_position:
        # `relations_values` = [F|T, F|T, H]
        relations_values = model_utils._generate_relative_positions_embeddings(
            shape[1], size_per_head, 64, cache=False)
        relations_values = tf.saturate_cast(relations_values, tf.float32)
        # attention_probs_t is [F, B, N, T]
        attention_probs_t = tf.transpose(attention_probs, [2, 0, 1, 3])
        # attention_probs_r is [F, B * N, T]
        attention_probs_r = tf.reshape(
            attention_probs_t, [shape[1], shape[0] * num_attention_heads, shape[1]])
        # value_position_scores is [F, B * N, H]
        value_position_scores = tf.matmul(attention_probs_r, relations_values,
                                          transpose_b=False)
        # value_position_scores_r is [F, B, N, H]
        value_position_scores_r = tf.reshape(
            value_position_scores,
            [shape[1], shape[0], num_attention_heads, size_per_head])
        # value_position_scores_r_t is [B, N, F, H]
        value_position_scores_r_t = tf.transpose(value_position_scores_r, [1, 2, 0, 3])
        context_layer = context_layer + value_position_scores_r_t

    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    context_layer = tf.reshape(context_layer,
                               shape=shape[:2] + [num_attention_heads * size_per_head])

    if do_return_attentions_probs:
        outputs = (context_layer, attention_probs)
    else:
        outputs = (context_layer, )
    return outputs
def post_process_gradients(grads_and_vars, summaries, lr, clip_gradients, larc_params):
    """Applies post processing to gradients, i.e. clipping, LARC, summaries."""
    if 'global_gradient_norm' in summaries:
        tf.summary.scalar('global_gradient_norm',
                          _global_norm_with_cast(grads_and_vars))

    # Optionally clip gradients by global norm.
    if clip_gradients is not None:
        grads_and_vars = _clip_gradients_by_norm(grads_and_vars, clip_gradients)

    # Add histograms for variables, gradients and gradient norms.
    if 'global_gradient_norm' in summaries:
        for gradient, variable in grads_and_vars:
            if isinstance(gradient, tf.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            if isinstance(variable, tf.IndexedSlices):
                var_values = variable.values
            else:
                var_values = variable

            if grad_values is not None:
                var_name = variable.name.replace(':', '_')
                if 'gradients' in summaries:
                    # need to mask nans for automatic loss scaling
                    tf.summary.histogram('gradients/%s' % var_name,
                                         mask_nans(grad_values))
                if 'gradient_norm' in summaries:
                    tf.summary.scalar('gradient_norm/%s' % var_name,
                                      tf.norm(grad_values))
                if 'variables' in summaries:
                    tf.summary.histogram('variables/%s' % var_name, var_values)
                if 'variable_norm' in summaries:
                    tf.summary.scalar('variable_norm/%s' % var_name,
                                      tf.norm(var_values))

    if clip_gradients is not None and 'global_gradient_norm' in summaries:
        tf.summary.scalar(
            'global_clipped_gradient_norm',
            _global_norm_with_cast(grads_and_vars),
        )

    # LARC gradient re-scaling
    if larc_params is not None:
        check_params(
            config=larc_params,
            required_dict={'larc_eta': float},
            optional_dict={
                'larc_mode': ['clip', 'scale'],
                'min_update': float,
                'epsilon': float,
            },
        )
        larc_eta = larc_params['larc_eta']
        larc_mode = larc_params.get('larc_mode', 'clip')
        min_update = larc_params.get('min_update', 1e-7)
        eps = larc_params.get('epsilon', 1e-7)

        grads_and_vars_larc = [None] * len(grads_and_vars)
        for idx, (g, v) in enumerate(grads_and_vars):
            var_dtype = v.dtype
            v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
            g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

            if larc_mode == 'clip':
                larc_grad_update = tf.maximum(
                    larc_eta * v_norm / (lr * (g_norm + eps)), min_update)
                if 'larc_summaries' in summaries:
                    tf.summary.scalar(
                        'larc_clip_on/{}'.format(v.name),
                        tf.cast(tf.less(larc_grad_update, 1.0), tf.int32),
                    )
                larc_grad_update = tf.minimum(larc_grad_update, 1.0)
            else:
                larc_grad_update = tf.maximum(
                    larc_eta * v_norm / (g_norm + eps), min_update)
            larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype)
            grads_and_vars_larc[idx] = (larc_grad_update * g, v)

            # adding additional summary
            if 'larc_summaries' in summaries:
                tf.summary.scalar('larc_grad_update/{}'.format(v.name),
                                  larc_grad_update)
                tf.summary.scalar(
                    'larc_final_lr/{}'.format(v.name),
                    tf.cast(lr, var_dtype) * larc_grad_update,
                )
        grads_and_vars = grads_and_vars_larc
    return grads_and_vars
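# Minimal numeric sketch (names and values are illustrative) of the 'clip'
# branch above: each gradient is rescaled by
#   min(1, larc_eta * ||v|| / (lr * (||g|| + eps)))
# before the scale factor is saturate-cast back to the variable's dtype.
import tensorflow as tf

larc_eta, lr, eps = 1e-3, 0.1, 1e-7
v = tf.random.normal([256, 256])   # variable values
g = tf.random.normal([256, 256])   # gradient
scale = tf.minimum(larc_eta * tf.norm(v) / (lr * (tf.norm(g) + eps)), 1.0)
scaled_grad = tf.saturate_cast(scale, g.dtype) * g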
def quantize_image(image):
    image = tf.round(image * 255)
    image = tf.saturate_cast(image, tf.uint8)
    return image
def quantize_image(image):
    # Assumes the input is already in [0, 255]; rounds by flooring after adding 0.5.
    image = tf.math.floor(image + 0.5)
    image = tf.saturate_cast(image, tf.uint8)
    return image