Example no. 1
def quantize_image(image):
    """Changes the range of pixel values to [0, 255] and cast it into uint8"""
    image = np.reshape(image, (image.shape[1], image.shape[2], 3))
    image = tf.convert_to_tensor(image)
    image = tf.round(image * 255)
    image = tf.saturate_cast(image, tf.uint8)
    return image
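A minimal usage sketch for the function above (toy input of my own, not from the source project), assuming a batched float image in [0, 1] of shape [1, H, W, 3]:

import numpy as np
import tensorflow as tf

# Hypothetical input: one 32x32 RGB image with float values in [0, 1].
float_image = np.random.uniform(0.0, 1.0, size=(1, 32, 32, 3)).astype(np.float32)
uint8_image = quantize_image(float_image)  # shape (32, 32, 3), dtype uint8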
Example no. 2
def quantize_image(image):
    """
    Taken from Balle's implementation
    """
    image = tf.round(image * 255)
    image = tf.saturate_cast(image, tf.uint8)
    return image
Example no. 3
 def summary(images, name):
     """As a hack, saves image summaries by adding to `eval_metric_ops`."""
     images = tf.saturate_cast(images * 255 + 0.5, tf.uint8)
     eval_metric_ops[name] = (tf.summary.image(name,
                                               images,
                                               max_outputs=2),
                              tf.no_op())
Example no. 4
def _legacy_output_transform_func(*expr,
                                  out_mul=1.0,
                                  out_add=0.0,
                                  out_shrink=1,
                                  out_dtype=None):
    if out_mul != 1.0:
        expr = [x * out_mul for x in expr]

    if out_add != 0.0:
        expr = [x + out_add for x in expr]

    if out_shrink > 1:
        ksize = [1, 1, out_shrink, out_shrink]
        expr = [
            tf.nn.avg_pool(x,
                           ksize=ksize,
                           strides=ksize,
                           padding="VALID",
                           data_format="NCHW") for x in expr
        ]

    if out_dtype is not None:
        if tf.as_dtype(out_dtype).is_integer:
            expr = [tf.round(x) for x in expr]
        expr = [tf.saturate_cast(x, out_dtype) for x in expr]
    return expr
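A minimal usage sketch (toy tensors of my own): mapping generator output in [-1, 1] to uint8 with the transform above, without spatial shrinking.

import tensorflow as tf

# Hypothetical NCHW float images in [-1, 1].
fake_images = tf.random.uniform([2, 3, 16, 16], minval=-1.0, maxval=1.0)
# out_mul/out_add map [-1, 1] to [0, 255]; an integer out_dtype triggers rounding and saturate_cast.
(uint8_images,) = _legacy_output_transform_func(
    fake_images, out_mul=127.5, out_add=127.5, out_dtype=tf.uint8)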
Example no. 5
    def _compute_compression_graph(self, input_image, create_summaries=True):
        """Compute a forward pass through encoder and decoder.

    Args:
      input_image: Input image, range [0, 255]
      create_summaries: Whether to create summaries

    Returns:
      tuple Nodes, BppPair
    """
        with tf.name_scope("image_shape"):
            image_shape = tf.shape(input_image)[1:-1]  # Get H, W.

        if self.evaluation:
            num_downscaling = self._encoder.num_downsampling_layers
            factor = 2**num_downscaling
            tf.logging.info("Padding to {}".format(factor))
            input_image = _pad(input_image, image_shape, factor)

        with tf.name_scope("scale_down"):
            input_image_scaled = \
                tf.cast(input_image, tf.float32) / 255.

        info = self._get_encoder_out(input_image_scaled, image_shape)
        decoder_in = info.decoded
        total_nbpp = info.total_nbpp
        total_qbpp = info.total_qbpp
        bitstream_tensors = info.bitstream_tensors

        reconstruction, reconstruction_scaled = \
            self._compute_reconstruction(
                decoder_in, image_shape, input_image_scaled.shape)

        if create_summaries and self._create_image_summaries:
            tf.summary.image("input_image",
                             tf.saturate_cast(input_image, tf.uint8),
                             max_outputs=1)
            tf.summary.image("reconstruction",
                             tf.saturate_cast(reconstruction, tf.uint8),
                             max_outputs=1)

        nodes = Nodes(input_image,
                      input_image_scaled,
                      reconstruction,
                      reconstruction_scaled,
                      latent_quantized=decoder_in)
        return nodes, BppPair(total_nbpp, total_qbpp), bitstream_tensors
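The `_pad` helper used in the evaluation branch is not shown above; the following is one possible sketch of my own, under the assumption that it pads H and W up to a multiple of 2**num_downsampling_layers:

import tensorflow as tf

def pad_to_factor(image, factor):
    """Pads a [1, H, W, C] image on the bottom/right so H and W become multiples of `factor`."""
    shape = tf.shape(image)
    pad_h = (-shape[1]) % factor
    pad_w = (-shape[2]) % factor
    return tf.pad(image, [[0, 0], [0, pad_h], [0, pad_w], [0, 0]])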
Example no. 6
def write_png(filename, image):
    """Creates graph to write a PNG image file."""
    image = tf.squeeze(image, 0)
    if image.dtype.is_floating:
        image = tf.round(image)
    if image.dtype != tf.uint8:
        image = tf.saturate_cast(image, tf.uint8)
    string = tf.image.encode_png(image)
    return tf.io.write_file(filename, string)
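A minimal usage sketch (hypothetical path and toy image of my own); in TF 1.x graph mode the returned op still has to be run in a session, while in TF 2.x it executes eagerly:

import tensorflow as tf

image = tf.random.uniform([1, 64, 64, 3], maxval=255.0)  # [1, H, W, C] floats in [0, 255]
write_op = write_png("/tmp/example.png", image)           # hypothetical output path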
Example no. 7
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 attention_mask=None,
                 token_type_ids=None,
                 return_pool=True,
                 scope=None,
                 reuse=False,
                 compute_type=tf.float32):
        super().__init__(config, is_training)

        input_shape = model_utils.get_shape_list(input_ids)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if attention_mask is None:
            attention_mask = tf.ones(shape=[batch_size, seq_length],
                                     dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                      dtype=tf.int32)

        with tf.variable_scope(
                scope,
                default_name="bert",
                reuse=tf.AUTO_REUSE if reuse else None,
                custom_getter=model_utils.get_custom_getter(compute_type)):
            with tf.variable_scope("embeddings"):
                self.embedding_output, self.embedding_table = albert_embedding(
                    config=self.config,
                    input_ids=input_ids,
                    token_type_ids=token_type_ids,
                )

            with tf.variable_scope("encoder"):
                attention_mask = model_utils.create_bert_mask(
                    input_ids, attention_mask)
                encoder_outputs = albert_encoder(config=self.config,
                                                 input_tensor=tf.saturate_cast(
                                                     self.embedding_output,
                                                     compute_type),
                                                 attention_mask=attention_mask)
            if return_pool:
                with tf.variable_scope("pooler"):
                    pooled_output = layers.pooler_layer(
                        sequence_output=encoder_outputs[0],
                        hidden_size=self.config.hidden_size,
                        initializer_range=self.config.initializer_range)
            else:
                pooled_output = None
        # (pooled output, sequence output, all layer outputs, all layer att probs)
        self.outputs = (pooled_output, ) + encoder_outputs
Example no. 8
def convert_images_to_uint8(images, drange=[-1,1], nchw_to_nhwc=False, shrink=1):
    """Convert a minibatch of images from float32 to uint8 with configurable dynamic range.
    Can be used as an output transformation for Network.run().
    """
    images = tf.cast(images, tf.float32)
    if shrink > 1:
        ksize = [1, 1, shrink, shrink]
        images = tf.nn.avg_pool(images, ksize=ksize, strides=ksize, padding="VALID", data_format="NCHW")
    if nchw_to_nhwc:
        images = tf.transpose(images, [0, 2, 3, 1])
    scale = 255 / (drange[1] - drange[0])
    images = images * scale + (0.5 - drange[0] * scale)
    return tf.saturate_cast(images, tf.uint8)
Example no. 9
    def run(
        self,
        *in_arrays,
        return_as_list=False,  # True = return a list of NumPy arrays, False = return a single NumPy array, or a tuple if there are multiple outputs.
        print_progress=False,  # Print progress to the console? Useful for very large input arrays.
        minibatch_size=None,  # Maximum minibatch size to use, None = disable batching.
        num_gpus=1,  # Number of GPUs to use.
        out_mul=1.0,  # Multiplicative constant to apply to the output(s).
        out_add=0.0,  # Additive constant to apply to the output(s).
        out_shrink=1,  # Shrink the spatial dimensions of the output(s) by the given factor.
        out_dtype=None,  # Convert the output to the specified data type.
        **dynamic_kwargs
    ):  # Additional keyword arguments to pass into the network construction function.

        # assert len(in_arrays) == self.num_inputs
        num_items = in_arrays[0].shape[0]
        print(num_items)
        if minibatch_size is None:
            minibatch_size = num_items
        key = str([
            list(sorted(dynamic_kwargs.items())), num_gpus, out_mul, out_add,
            out_shrink, out_dtype
        ])

        # Build graph.
        if key not in self._run_cache:
            with absolute_name_scope(self.scope +
                                     '/Run'), tf.control_dependencies(None):
                in_split = list(
                    zip(*[tf.split(x, num_gpus)
                          for x in self.input_templates]))
                out_split = []
                for gpu in range(num_gpus):
                    with tf.device('/gpu:%d' % gpu):
                        out_expr = self.get_output_for(*in_split[gpu],
                                                       return_as_list=True,
                                                       **dynamic_kwargs)
                        if out_mul != 1.0:
                            out_expr = [x * out_mul for x in out_expr]
                        if out_add != 0.0:
                            out_expr = [x + out_add for x in out_expr]
                        if out_shrink > 1:
                            ksize = [1, 1, out_shrink, out_shrink]
                            out_expr = [
                                tf.nn.avg_pool(x,
                                               ksize=ksize,
                                               strides=ksize,
                                               padding='VALID',
                                               data_format='NCHW')
                                for x in out_expr
                            ]
                        if out_dtype is not None:
                            if tf.as_dtype(out_dtype).is_integer:
                                out_expr = [tf.round(x) for x in out_expr]
                            out_expr = [
                                tf.saturate_cast(x, out_dtype)
                                for x in out_expr
                            ]
                        out_split.append(out_expr)
                self._run_cache[key] = [
                    tf.concat(outputs, axis=0) for outputs in zip(*out_split)
                ]

        # Run minibatches.
        out_expr = self._run_cache[key]
        out_arrays = [
            np.empty([num_items] + shape_to_list(expr.shape)[1:],
                     expr.dtype.name) for expr in out_expr
        ]
        for mb_begin in range(0, num_items, minibatch_size):
            if print_progress:
                print('\r%d / %d' % (mb_begin, num_items), end='')
            mb_end = min(mb_begin + minibatch_size, num_items)
            mb_in = [src[mb_begin:mb_end] for src in in_arrays]
            # config = tf.compat.v1.ConfigProto(log_device_placement=True, allow_soft_placement=True)
            print("Num GPUs Available: ", tf.config.list_physical_devices())
            mb_out = tf.Session().run(out_expr,
                                      dict(zip(self.input_templates, mb_in)))
            # mb_out = tf.get_default_session().run(out_expr, dict(zip(self.input_templates, mb_in)))
            for dst, src in zip(out_arrays, mb_out):
                dst[mb_begin:mb_end] = src

        # Done.
        if print_progress:
            print('\r%d / %d' % (num_items, num_items))
        if not return_as_list:
            out_arrays = out_arrays[0] if len(out_arrays) == 1 else tuple(
                out_arrays)
        return out_arrays
Example no. 10
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 attention_mask=None,
                 token_type_ids=None,
                 return_pool=True,
                 scope=None,
                 reuse=False,
                 compute_type=tf.float32):
        super().__init__(config, is_training)

        input_shape = model_utils.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if attention_mask is None:
            attention_mask = tf.ones(shape=[batch_size, seq_length],
                                     dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                      dtype=tf.int32)

        with tf.variable_scope(
                scope,
                default_name="electra",
                reuse=tf.AUTO_REUSE if reuse else None,
                custom_getter=model_utils.get_custom_getter(compute_type)):
            with tf.variable_scope("embeddings"):
                self.embedding_output, self.embedding_table = bert_embedding(
                    config=self.config,
                    input_ids=input_ids,
                    token_type_ids=token_type_ids,
                    add_position_embedding=True)

            if model_utils.get_shape_list(
                    self.embedding_output)[-1] != self.config.hidden_size:
                self.embedding_output = layers.dense(
                    self.embedding_output,
                    self.config.hidden_size,
                    'embeddings_project',
                    initializer_range=self.config.initializer_range)

            with tf.variable_scope("encoder"):
                attention_mask = model_utils.create_bert_mask(
                    input_ids, attention_mask)
                encoder_outputs = bert_encoder(config=self.config,
                                               input_tensor=tf.saturate_cast(
                                                   self.embedding_output,
                                                   compute_type),
                                               attention_mask=attention_mask,
                                               use_relative_position=False)

        # ELECTRA's pooled output is simply the first token's vector.
        if return_pool:
            pooled_output = encoder_outputs[0][:, 0]
        else:
            pooled_output = None
        # (pooled output, sequence output, all layer outputs, all layer att probs)
        self.outputs = (pooled_output, ) + encoder_outputs
Example no. 11
def attention_layer(
        from_tensor,
        to_tensor,
        attention_mask=None,
        num_attention_heads=1,
        size_per_head=512,
        query_act=None,
        key_act=None,
        value_act=None,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        do_return_2d_tensor=False,
        batch_size=None,
        from_seq_length=None,
        to_seq_length=None,
        use_relative_position=False,  # relative position encoding used by NEZHA
        compute_type=tf.float32,  # compute in float16 or float32
        do_return_attentions_probs=False):
    def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                             seq_length, width):
        # Reshape the flat input to [batch, seq_len, num_heads, width].
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])

        # Transpose to [batch, num_heads, seq_len, width].
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    # from_tensor and to_tensor must have the same rank.
    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None
                or to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    # Flatten to 2D: [batch * seq_len, width].
    from_tensor_2d = reshape_to_matrix(from_tensor)
    to_tensor_2d = reshape_to_matrix(to_tensor)

    # `query_layer` = [B*F, N*H]
    # [bz* seq_len, width] --> [bz * seq_len, num_heads * one_head_size]
    query_layer = tf.layers.dense(
        from_tensor_2d,
        num_attention_heads * size_per_head,
        activation=query_act,
        name="query",
        kernel_initializer=create_initializer(initializer_range))

    # `key_layer` = [B*T, N*H]
    # [bz* seq_len, width] --> [bz * seq_len, num_heads * one_head_size]
    key_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=key_act,
        name="key",
        kernel_initializer=create_initializer(initializer_range))

    # `value_layer` = [B*T, N*H]
    # [bz* seq_len, width] --> [bz * seq_len, num_heads * one_head_size]
    value_layer = tf.layers.dense(
        to_tensor_2d,
        num_attention_heads * size_per_head,
        activation=value_act,
        name="value",
        kernel_initializer=create_initializer(initializer_range))

    # `query_layer` = [B, N, F, H]
    # [bz * seq_len, num_heads * one_head_size] --> [bz, num_heads, seq_len, one_head_size]
    query_layer = transpose_for_scores(query_layer, batch_size,
                                       num_attention_heads, from_seq_length,
                                       size_per_head)

    # `key_layer` = [B, N, T, H]
    # [bz * seq_len, num_heads * one_head_size] --> [bz, num_heads, seq_len, one_head_size]
    key_layer = transpose_for_scores(key_layer, batch_size,
                                     num_attention_heads, to_seq_length,
                                     size_per_head)

    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    # `attention_scores` = [B, N, F, T]
    # [B, N, F, H] * [B, N, T, H] = [B, N, F, T]
    # Raw self-attention scores.
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)

    if use_relative_position:
        assert from_seq_length == to_seq_length
        max_relative_position = 64
        # `relation_keys` = [F|T, F|T, H]
        relations_keys = _generate_relative_positions_embeddings(
            to_seq_length, size_per_head, max_relative_position, cache=False)
        relations_keys = tf.saturate_cast(relations_keys, compute_type)
        # query_layer_t is [F, B, N, H]
        query_layer_t = tf.transpose(query_layer, [2, 0, 1, 3])
        # query_layer_r is [F, B * N, H]
        query_layer_r = tf.reshape(
            query_layer_t,
            [from_seq_length, batch_size * num_attention_heads, size_per_head])
        # key_position_scores is [F, B * N, F|T]
        key_position_scores = tf.matmul(query_layer_r,
                                        relations_keys,
                                        transpose_b=True)
        # key_position_scores_r is [F, B , N, F|T]
        key_position_scores_r = tf.reshape(key_position_scores, [
            from_seq_length, batch_size, num_attention_heads, from_seq_length
        ])
        # key_position_scores_r_t is [B, N, F, F|T]
        key_position_scores_r_t = tf.transpose(key_position_scores_r,
                                               [1, 2, 0, 3])
        attention_scores = attention_scores + key_position_scores_r_t

    # Scale the scores by 1 / sqrt(size_per_head).
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        # Expand dims so the mask broadcasts across attention heads.
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 -
                 tf.cast(attention_mask, attention_scores.dtype)) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        # Only attended positions keep their original scores.
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    # After the softmax, positions whose mask was 0 end up with near-zero probability.
    attention_probs = tf.nn.softmax(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

    # `value_layer` = [B, T, N, H]
    # [bz * seq_len, num_heads * one_head_size] --> [bz, seq_len, num_heads, one_head_size]
    value_layer = tf.reshape(
        value_layer,
        [batch_size, to_seq_length, num_attention_heads, size_per_head])

    # `value_layer` = [B, N, T, H]
    # [bz, seq_len, num_heads, one_head_size] --> [bz, num_heads, seq_len, one_head_size]
    value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

    # `context_layer` = [B, N, F, H]
    # Multiply the attention probabilities by the values to get the context.
    # [B, N, F, T] * [B, N, T, H] = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)

    if use_relative_position:
        # `relation_values` = [F|T, F|T, H]
        relations_values = _generate_relative_positions_embeddings(
            to_seq_length, size_per_head, max_relative_position, cache=False)
        relations_values = tf.saturate_cast(relations_values, compute_type)
        # attention_probs_t is [F, B, N, T]
        attention_probs_t = tf.transpose(attention_probs, [2, 0, 1, 3])
        # attention_probs_r is [F, B * N, T]
        attention_probs_r = tf.reshape(
            attention_probs_t,
            [from_seq_length, batch_size * num_attention_heads, to_seq_length])
        # key_position_scores is [F, B * N, H]
        value_position_scores = tf.matmul(attention_probs_r,
                                          relations_values,
                                          transpose_b=False)
        # value_position_scores_r is [F, B , N, H]
        value_position_scores_r = tf.reshape(
            value_position_scores,
            [from_seq_length, batch_size, num_attention_heads, size_per_head])
        # value_position_scores_r_t is [B, N, F, H]
        value_position_scores_r_t = tf.transpose(value_position_scores_r,
                                                 [1, 2, 0, 3])
        # attention_scores = attention_scores + value_position_scores_r_t
        context_layer = context_layer + value_position_scores_r_t

    # `context_layer` = [B, F, N, H]
    # Transpose the context back to [batch, seq_len, num_heads, size_per_head].
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    if do_return_2d_tensor:
        # `context_layer` = [B*F, N*H]
        context_layer = tf.reshape(context_layer, [
            batch_size * from_seq_length, num_attention_heads * size_per_head
        ])
    else:
        # `context_layer` = [B, F, N*H]
        context_layer = tf.reshape(
            context_layer,
            [batch_size, from_seq_length, num_attention_heads * size_per_head])

    if do_return_attentions_probs:
        outputs = (context_layer, attention_probs)
    else:
        outputs = (context_layer, )
    return outputs
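A self-contained sketch (toy tensors of my own, not the repository's API) of the core pattern above: scaled dot-product attention with the additive -10000.0 mask trick.

import math
import tensorflow as tf

# Toy shapes: B=2, N=1 head, F=T=4, H=8.
q = tf.random.normal([2, 1, 4, 8])
k = tf.random.normal([2, 1, 4, 8])
v = tf.random.normal([2, 1, 4, 8])
mask = tf.constant([[1, 1, 1, 0], [1, 1, 0, 0]], tf.float32)  # [B, T], 1 = attend

scores = tf.matmul(q, k, transpose_b=True) / math.sqrt(8.0)    # [B, N, F, T]
adder = (1.0 - mask[:, tf.newaxis, tf.newaxis, :]) * -10000.0  # large negative on masked slots
probs = tf.nn.softmax(scores + adder)                          # masked positions get ~0 probability
context = tf.matmul(probs, v)                                  # [B, N, F, H]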
Example no. 12
    def __init__(self,
                 bert_config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=True,
                 scope=None,
                 embedding_size=None,
                 input_embeddings=None,
                 input_reprs=None,
                 update_embeddings=True,
                 untied_embeddings=False,
                 use_fp16=False):
        """Constructor for BertModel.

        Args:
          bert_config: `BertConfig` instance.
          is_training: bool. true for training model, false for eval model. Controls
            whether dropout will be applied.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
          token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
          use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
            it is much faster if this is True, on the CPU or GPU, it is faster if
            this is False.
          scope: (optional) variable scope. Defaults to "electra".

        Raises:
          ValueError: The config is invalid or one of the input tensor shapes
            is invalid.
        """
        bert_config = copy.deepcopy(bert_config)
        if not is_training:
            bert_config.hidden_dropout_prob = 0.0
            bert_config.attention_probs_dropout_prob = 0.0

        input_shape = get_shape_list(token_type_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        assert token_type_ids is not None

        embedding_scope = (scope
                           if untied_embeddings else "electra") + "/embeddings"

        if input_reprs is None:
            if input_embeddings is None:
                with tf.variable_scope(embedding_scope, reuse=tf.AUTO_REUSE):
                    # Perform embedding lookup on the word ids
                    if embedding_size is None:
                        embedding_size = bert_config.hidden_size
                    self.token_embeddings, self.embedding_table = embedding_lookup(
                        input_ids=input_ids,
                        vocab_size=bert_config.vocab_size,
                        embedding_size=embedding_size,
                        initializer_range=bert_config.initializer_range,
                        word_embedding_name="word_embeddings",
                        use_one_hot_embeddings=use_one_hot_embeddings)
            else:
                self.token_embeddings = input_embeddings

            with tf.variable_scope(embedding_scope, reuse=tf.AUTO_REUSE):
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.token_embeddings,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=bert_config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=bert_config.initializer_range,
                    max_position_embeddings=bert_config.
                    max_position_embeddings,
                    dropout_prob=bert_config.hidden_dropout_prob)
        else:
            self.embedding_output = input_reprs
        if not update_embeddings:
            self.embedding_output = tf.stop_gradient(self.embedding_output)

        with tf.variable_scope(scope, default_name="electra"):
            if self.embedding_output.shape[-1] != bert_config.hidden_size:
                self.embedding_output = tf.layers.dense(
                    self.embedding_output,
                    bert_config.hidden_size,
                    name="embeddings_project")

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = create_attention_mask_from_input_mask(
                    token_type_ids, input_mask, use_fp16)

                # Run the stacked transformer. Output shapes
                # sequence_output: [batch_size, seq_length, hidden_size]
                # pooled_output: [batch_size, hidden_size]
                # all_encoder_layers: [n_layers, batch_size, seq_length, hidden_size].
                # attn_maps: [n_layers, batch_size, n_heads, seq_length, seq_length]
                self.all_layer_outputs, self.attn_maps = transformer_model(
                    input_tensor=tf.saturate_cast(self.embedding_output,
                                                  infer_dtype(use_fp16)),
                    attention_mask=attention_mask,
                    hidden_size=bert_config.hidden_size,
                    num_hidden_layers=bert_config.num_hidden_layers,
                    num_attention_heads=bert_config.num_attention_heads,
                    intermediate_size=bert_config.intermediate_size,
                    intermediate_act_fn=get_activation(bert_config.hidden_act),
                    hidden_dropout_prob=bert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=bert_config.
                    attention_probs_dropout_prob,
                    initializer_range=bert_config.initializer_range,
                    do_return_all_layers=True,
                    use_fp16=use_fp16)
                self.sequence_output = self.all_layer_outputs[-1]
                self.pooled_output = self.sequence_output[:, 0]
Example no. 13
def convert_float_to_uint8(image):
    image = tf.round(image * 255)
    image = tf.saturate_cast(image, tf.uint8)
    return image
Example no. 14
def attention(
        input_tensor,
        attention_mask=None,
        hidden_size=768,
        num_attention_heads=12,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        use_relative_position=False,  # relative position encoding used by NEZHA
        do_return_attentions_probs=False):
    """
    Self-attention layer.
    :param input_tensor: [batch, seq_len, hidden_size]
    :param attention_mask:
    :param hidden_size:
    :param num_attention_heads:
    :param attention_probs_dropout_prob:
    :param initializer_range:
    :param use_relative_position:
    :param do_return_attentions_probs:
    :return:
    """
    def transpose(tensor, shape):
        output_tensor = tf.reshape(tensor, shape=shape)
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    shape = model_utils.get_shape_list(input_tensor, expected_rank=3)

    size_per_head = hidden_size // num_attention_heads
    query_layer = dense(input_tensor=input_tensor,
                        output_size=num_attention_heads * size_per_head,
                        activation=None,
                        name='query',
                        initializer_range=initializer_range)
    key_layer = dense(input_tensor=input_tensor,
                      output_size=num_attention_heads * size_per_head,
                      activation=None,
                      name='key',
                      initializer_range=initializer_range)
    value_layer = dense(input_tensor=input_tensor,
                        output_size=num_attention_heads * size_per_head,
                        activation=None,
                        name='value',
                        initializer_range=initializer_range)

    query_layer = transpose(query_layer,
                            shape=shape[:-1] +
                            [num_attention_heads, size_per_head])

    key_layer = transpose(key_layer,
                          shape=shape[:-1] +
                          [num_attention_heads, size_per_head])

    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)

    if use_relative_position:
        # `relation_keys` = [F|T, F|T, H]
        relations_keys = model_utils._generate_relative_positions_embeddings(
            shape[1], size_per_head, 64, cache=False)
        relations_keys = tf.saturate_cast(relations_keys, tf.float32)
        # query_layer_t is [F, B, N, H]
        query_layer_t = tf.transpose(query_layer, [2, 0, 1, 3])
        # query_layer_r is [F, B * N, H]
        query_layer_r = tf.reshape(
            query_layer_t,
            [shape[1], shape[0] * num_attention_heads, size_per_head])
        # key_position_scores is [F, B * N, F|T]
        key_position_scores = tf.matmul(query_layer_r,
                                        relations_keys,
                                        transpose_b=True)
        # key_position_scores_r is [F, B , N, F|T]
        key_position_scores_r = tf.reshape(
            key_position_scores,
            [shape[1], shape[0], num_attention_heads, shape[1]])
        # key_position_scores_r_t is [B, N, F, F|T]
        key_position_scores_r_t = tf.transpose(key_position_scores_r,
                                               [1, 2, 0, 3])
        attention_scores = attention_scores + key_position_scores_r_t

    # Scale the scores by 1 / sqrt(size_per_head).
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))
    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        # Expand dims so the mask broadcasts across attention heads.
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        adder = (1.0 -
                 tf.cast(attention_mask, attention_scores.dtype)) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        # Only attended positions keep their original scores.
        attention_scores += adder

    attention_probs = tf.nn.softmax(attention_scores)

    attention_probs = model_utils.dropout(attention_probs,
                                          attention_probs_dropout_prob)

    # `value_layer` = [B, N, T, H]
    # [bz, seq_len, num_heads * one_head_size] --> [bz, num_heads, seq_len, one_head_size]
    value_layer = transpose(value_layer,
                            shape=shape[:-1] +
                            [num_attention_heads, size_per_head])

    # `context_layer` = [B, N, F, H]
    # Multiply the attention probabilities by the values to get the context.
    # [B, N, F, T] * [B, N, T, H] = [B, N, F, H]
    context_layer = tf.matmul(attention_probs, value_layer)

    if use_relative_position:
        # `relation_values` = [F|T, F|T, H]
        relations_values = model_utils._generate_relative_positions_embeddings(
            shape[1], size_per_head, 64, cache=False)
        relations_values = tf.saturate_cast(relations_values, tf.float32)
        # attention_probs_t is [F, B, N, T]
        attention_probs_t = tf.transpose(attention_probs, [2, 0, 1, 3])
        # attention_probs_r is [F, B * N, T]
        attention_probs_r = tf.reshape(
            attention_probs_t,
            [shape[1], shape[0] * num_attention_heads, shape[1]])
        # key_position_scores is [F, B * N, H]
        value_position_scores = tf.matmul(attention_probs_r,
                                          relations_values,
                                          transpose_b=False)
        # value_position_scores_r is [F, B , N, H]
        value_position_scores_r = tf.reshape(
            value_position_scores,
            [shape[1], shape[0], num_attention_heads, size_per_head])
        # value_position_scores_r_t is [B, N, F, H]
        value_position_scores_r_t = tf.transpose(value_position_scores_r,
                                                 [1, 2, 0, 3])
        # attention_scores = attention_scores + value_position_scores_r_t
        context_layer = context_layer + value_position_scores_r_t

    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

    context_layer = tf.reshape(context_layer,
                               shape=shape[:2] +
                               [num_attention_heads * size_per_head])

    if do_return_attentions_probs:
        outputs = (context_layer, attention_probs)
    else:
        outputs = (context_layer, )
    return outputs
Esempio n. 15
0
def post_process_gradients(grads_and_vars, summaries, lr, clip_gradients,
                           larc_params):
    """Applies post processing to gradients, i.e. clipping, LARC, summaries."""
    if 'global_gradient_norm' in summaries:
        tf.summary.scalar('global_gradient_norm',
                          _global_norm_with_cast(grads_and_vars))

    # Optionally clip gradients by global norm.
    if clip_gradients is not None:
        grads_and_vars = _clip_gradients_by_norm(grads_and_vars,
                                                 clip_gradients)

    # Add histograms for variables, gradients and gradient norms.

    if 'global_gradient_norm' in summaries:
        for gradient, variable in grads_and_vars:
            if isinstance(gradient, tf.IndexedSlices):
                grad_values = gradient.values
            else:
                grad_values = gradient

            if isinstance(variable, tf.IndexedSlices):
                var_values = variable.values
            else:
                var_values = variable

            if grad_values is not None:
                var_name = variable.name.replace(':', '_')
                if 'gradients' in summaries:
                    # need to mask nans for automatic loss scaling
                    tf.summary.histogram('gradients/%s' % var_name,
                                         mask_nans(grad_values))
                if 'gradient_norm' in summaries:
                    tf.summary.scalar('gradient_norm/%s' % var_name,
                                      tf.norm(grad_values))
                if 'variables' in summaries:
                    tf.summary.histogram('variables/%s' % var_name, var_values)
                if 'variable_norm' in summaries:
                    tf.summary.scalar('variable_norm/%s' % var_name,
                                      tf.norm(var_values))

    if clip_gradients is not None and 'global_gradient_norm' in summaries:
        tf.summary.scalar(
            'global_clipped_gradient_norm',
            _global_norm_with_cast(grads_and_vars),
        )

    # LARC gradient re-scaling
    if larc_params is not None:
        check_params(
            config=larc_params,
            required_dict={'larc_eta': float},
            optional_dict={
                'larc_mode': ['clip', 'scale'],
                'min_update': float,
                'epsilon': float,
            },
        )
        larc_eta = larc_params['larc_eta']
        larc_mode = larc_params.get('larc_mode', 'clip')
        min_update = larc_params.get('min_update', 1e-7)
        eps = larc_params.get('epsilon', 1e-7)

        grads_and_vars_larc = [None] * len(grads_and_vars)
        for idx, (g, v) in enumerate(grads_and_vars):
            var_dtype = v.dtype
            v_norm = tf.norm(tensor=tf.cast(v, tf.float32), ord=2)
            g_norm = tf.norm(tensor=tf.cast(g, tf.float32), ord=2)

            if larc_mode == 'clip':
                larc_grad_update = tf.maximum(
                    larc_eta * v_norm / (lr * (g_norm + eps)), min_update)
                if 'larc_summaries' in summaries:
                    tf.summary.scalar(
                        'larc_clip_on/{}'.format(v.name),
                        tf.cast(tf.less(larc_grad_update, 1.0), tf.int32),
                    )
                larc_grad_update = tf.minimum(larc_grad_update, 1.0)
            else:
                larc_grad_update = tf.maximum(
                    larc_eta * v_norm / (g_norm + eps), min_update)
            larc_grad_update = tf.saturate_cast(larc_grad_update, var_dtype)
            grads_and_vars_larc[idx] = (larc_grad_update * g, v)

            # adding additional summary
            if 'larc_summaries' in summaries:
                tf.summary.scalar('larc_grad_update/{}'.format(v.name),
                                  larc_grad_update)
                tf.summary.scalar(
                    'larc_final_lr/{}'.format(v.name),
                    tf.cast(lr, var_dtype) * larc_grad_update,
                )
        grads_and_vars = grads_and_vars_larc
    return grads_and_vars
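A small numeric sketch (values of my own) of the LARC "clip" factor computed above: each variable's gradient is rescaled by min(max(eta * ||v|| / (lr * (||g|| + eps)), min_update), 1.0).

import numpy as np

v = np.array([0.5, -1.0, 2.0], np.float32)    # hypothetical variable
g = np.array([0.1, 0.02, -0.05], np.float32)  # hypothetical gradient
lr, eta, eps, min_update = 0.1, 1e-3, 1e-7, 1e-7

factor = eta * np.linalg.norm(v) / (lr * (np.linalg.norm(g) + eps))
factor = min(max(factor, min_update), 1.0)    # ~0.2 for these numbers
scaled_grad = factor * g                       # applied in place of g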
Example no. 16
def quantize_image(image):
  image = tf.round(image * 255)
  image = tf.saturate_cast(image, tf.uint8)
  return image
Example no. 17
def quantize_image(image):
    image = tf.math.floor(image + 0.5)
    image = tf.saturate_cast(image, tf.uint8)
    return image
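A quick comparison of my own (run eagerly in TF 2.x) of the two rounding styles used across these examples: tf.round rounds ties to the nearest even integer, floor(x + 0.5) always rounds ties up, and tf.saturate_cast clamps out-of-range values before converting the dtype.

import tensorflow as tf

x = tf.constant([125.5, 126.5, 127.5])
print(tf.round(x))             # [126. 126. 128.]  ties go to the even integer
print(tf.math.floor(x + 0.5))  # [126. 127. 128.]  ties always round up
print(tf.saturate_cast(tf.constant([-3.0, 260.0]), tf.uint8))  # [0 255], clamped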