Example #1
0
    def add_variable(self, name, shape=None, **kwargs):
        """Create a weight plus a non-trainable mask and return their product.

        Two variables are registered through the parent class's
        ``add_variable``: a weight named
        ``lottery.weight_name_of_base_name(name)`` that receives ``shape`` and
        every entry of ``kwargs``, and a mask named
        ``lottery.mask_name_of_base_name(name)`` with the same shape that is
        non-trainable and initialised to ones.

        Args:
          name: Base name from which the weight and mask names are derived.
          shape: Shape used for both the weight and the mask.
          **kwargs: Forwarded unchanged to the parent ``add_variable`` for the
            weight. Of these, only ``dtype`` (if given) is also applied to
            the mask.

        Returns:
          The element-wise product ``weight * mask``.
        """
        variable = super().add_variable(lottery.weight_name_of_base_name(name),
                                        shape, **kwargs)

        # The mask has to share the weight's dtype: tf.math.multiply does not
        # promote mixed dtypes. `None` keeps the parent's default when the
        # caller did not request a specific dtype.
        mask = super().add_variable(lottery.mask_name_of_base_name(name),
                                    shape,
                                    dtype=kwargs.get('dtype'),
                                    trainable=False,
                                    initializer=tf.initializers.ones())
        return tf.math.multiply(variable, mask)
Example #2
0
def conv2d_fixed_padding(inputs, filters, kernel_size, strides, data_format):
    """Masked, strided 2-D convolution with explicit padding.

    The padding depends only on `kernel_size`, not on the dimensions of
    `inputs` (as opposed to using `tf.layers.conv2d` alone).

    A trainable kernel and a non-trainable all-ones mask of the same shape
    are created with `tf.get_variable`; the convolution uses their
    element-wise product. Variable names are derived from the module-level
    counters `_layer_idx` and `_proj_idx`: `conv_<layer_idx>` when
    `kernel_size > 1`, otherwise `proj_<layer_idx>_<proj_idx>`.

    Args:
      inputs: Input tensor; channels are read from the last axis for
        'channels_last' and from axis 1 for any other `data_format`.
      filters: Number of output channels of the convolution.
      kernel_size: Side length of the square kernel.
      strides: Stride applied to both spatial dimensions.
      data_format: 'channels_last' selects NHWC; anything else selects NCHW.

    Returns:
      The output tensor of the convolution.
    """
    global _layer_idx, _proj_idx

    # With a stride above one the input is padded explicitly and the
    # convolution below runs with 'VALID' padding.
    if strides > 1:
        inputs = fixed_padding(inputs, kernel_size, data_format)

    if data_format == 'channels_last':
        in_channels = inputs.shape[-1]
        conv_format = 'NHWC'
        conv_strides = [1, strides, strides, 1]
    else:
        in_channels = inputs.shape[1]
        conv_format = 'NCHW'
        conv_strides = [1, 1, strides, strides]

    kernel_shape = (kernel_size, kernel_size, in_channels, filters)

    # Kernels larger than 1x1 are named as regular conv layers; 1x1 kernels
    # are named as projections.
    if kernel_size > 1:
        base_name = 'conv_{}'.format(_layer_idx)
    else:
        base_name = 'proj_{}_{}'.format(_layer_idx, _proj_idx)

    weights = tf.get_variable(
        initializer=tf.compat.v1.variance_scaling_initializer(),
        trainable=True,
        shape=kernel_shape,
        dtype=inputs.dtype,
        name=lottery.weight_name_of_base_name(base_name))

    weights_mask = tf.get_variable(
        initializer=tf.ones(kernel_shape, dtype=inputs.dtype),
        trainable=False,
        dtype=inputs.dtype,
        name=lottery.mask_name_of_base_name(base_name),
    )

    masked_kernel = tf.math.multiply(weights, weights_mask)

    if strides == 1:
        conv_padding = 'SAME'
    else:
        conv_padding = 'VALID'

    outputs = tf.nn.conv2d(
        inputs,
        masked_kernel,
        conv_strides,
        padding=conv_padding,
        data_format=conv_format,
    )

    # The projection counter advances on every call; the layer counter only
    # advances for kernels larger than 1x1.
    _proj_idx += 1
    if kernel_size > 1:
        _layer_idx += 1

    return outputs
  def build_graph(self, hparams, scope=None):
    """Build the masked output projection and the seq2seq model graph.

    Creates the decoder output-projection weight together with a
    non-trainable all-ones mask, stores their element-wise product in
    `self.output_layer`, and then builds the model through
    `self._build_model`.

    Args:
      hparams: Hyperparameter configurations. `hparams.activation_dtype` is
        read here; when it equals "bfloat16" a custom variable getter is
        installed on the model scope.
      scope: VariableScope for the created subgraph. When falsy, the
        projection variables are created under "build_network" and the model
        under "dynamic_seq2seq".

    Returns:
      A tuple `(logits_or_loss, predicted_ids)` taken from the first and
      third values returned by `self._build_model(hparams)`: of the form
      (logits, predicted_ids) for inference and (loss, None) for training.

    Raises:
      ValueError: presumably propagated from `_build_model` when encoder_type
        or attention_option is unsupported (TODO confirm against
        `_build_model`).
    """
    utils.print_out("# Creating %s graph ..." % self.mode)

    # Projection
    with tf.variable_scope(scope or "build_network"):
      with tf.variable_scope("decoder/output_projection"):
        projection_shape = [self.num_units, self.tgt_vocab_size]
        output_layer = tf.get_variable(
            lottery.weight_name_of_base_name("output_projection"),
            projection_shape)
        output_layer_mask = tf.get_variable(
            lottery.mask_name_of_base_name("output_projection"),
            projection_shape,
            trainable=False,
            initializer=tf.initializers.ones())
        self.output_layer = tf.math.multiply(output_layer, output_layer_mask)

    with tf.variable_scope(scope or "dynamic_seq2seq", dtype=self.dtype):
      use_bfloat16 = hparams.activation_dtype == "bfloat16"
      if use_bfloat16:
        # The getter must be installed before `_build_model` creates any
        # variables in this scope.
        tf.get_variable_scope().set_custom_getter(utils.bfloat16_var_getter)

      logits_or_loss, decoder_cell_outputs, predicted_ids = self._build_model(
          hparams)

      if use_bfloat16 and decoder_cell_outputs is not None:
        # NOTE(review): the float32 copy is not consumed within this method.
        decoder_cell_outputs = tf.cast(decoder_cell_outputs, tf.float32)

    return logits_or_loss, predicted_ids
Example #4
0
def _create_or_load_embed(embed_name, vocab_file, embed_file, vocab_size,
                          embed_size, dtype):
    """Return an embedding matrix, either pretrained or freshly created.

    When both `vocab_file` and `embed_file` are truthy the result of
    `_create_pretrained_emb_from_txt(vocab_file, embed_file)` is returned
    as is. Otherwise a `[vocab_size, embed_size]` weight variable and a
    non-trainable all-ones mask of the same shape and `dtype` are created,
    and their element-wise product is returned.
    """
    if vocab_file and embed_file:
        return _create_pretrained_emb_from_txt(vocab_file, embed_file)

    table_shape = [vocab_size, embed_size]
    weights = tf.get_variable(
        lottery.weight_name_of_base_name(embed_name), table_shape, dtype)
    weights_mask = tf.get_variable(
        lottery.mask_name_of_base_name(embed_name),
        table_shape,
        dtype,
        trainable=False,
        initializer=tf.initializers.ones())
    return tf.math.multiply(weights, weights_mask)
Example #5
0
    def __call__(self, inputs, training):
        """Build the graph that classifies a batch of input images.

        Args:
          inputs: A Tensor representing a batch of input images.
          training: A boolean. Set to True to add operations required only
            when training the classifier.

        Returns:
          A logits Tensor with shape [<batch_size>, self.num_classes].
        """
        global _layer_idx, _proj_idx
        # Restart the module-level naming counters on every call
        # (presumably the ones conv2d_fixed_padding uses to name variables).
        _layer_idx = _proj_idx = 1

        with self._model_variable_scope():
            channels_first = self.data_format == 'channels_first'

            net = inputs
            if channels_first:
                # Convert the inputs from channels_last (NHWC) to
                # channels_first (NCHW). This provides a large performance
                # boost on GPU. See
                # https://www.tensorflow.org/performance/performance_guide#data_formats
                net = tf.transpose(net, [0, 3, 1, 2])

            net = conv2d_fixed_padding(inputs=net,
                                       filters=self.num_filters,
                                       kernel_size=self.kernel_size,
                                       strides=self.conv_stride,
                                       data_format=self.data_format)
            net = tf.identity(net, 'initial_conv')

            # V2 skips batch norm and activation after the initial conv
            # because the first ResNet unit performs them for both the
            # shortcut and non-shortcut paths as part of the first block's
            # projection. Cf. Appendix of [2].
            if self.resnet_version == 1:
                net = batch_norm(net, training, self.data_format)
                net = tf.nn.relu(net)

            if self.first_pool_size:
                net = tf.layers.max_pooling2d(
                    inputs=net,
                    pool_size=self.first_pool_size,
                    strides=self.first_pool_stride,
                    padding='SAME',
                    data_format=self.data_format)
                net = tf.identity(net, 'initial_max_pool')

            # The filter count doubles from one block layer to the next.
            for stage, num_blocks in enumerate(self.block_sizes, start=1):
                net = block_layer(
                    inputs=net,
                    filters=self.num_filters * (2**(stage - 1)),
                    bottleneck=self.bottleneck,
                    block_fn=self.block_fn,
                    blocks=num_blocks,
                    strides=self.block_strides[stage - 1],
                    training=training,
                    name='block_layer{}'.format(stage),
                    data_format=self.data_format)

            # Only apply the BN and ReLU for models that do pre-activation
            # in each building/bottleneck block, e.g. ResNet V2.
            if self.pre_activation:
                net = batch_norm(net, training, self.data_format)
                net = tf.nn.relu(net)

            # ResNet average-pools over the spatial dimensions, which is the
            # same as a reduce_mean; reduce_mean is used here because it
            # performs better than AveragePooling2D.
            if channels_first:
                spatial_axes = [2, 3]
            else:
                spatial_axes = [1, 2]
            net = tf.reduce_mean(net, spatial_axes, keepdims=True)
            net = tf.identity(net, 'final_reduce_mean')

            net = tf.squeeze(net, spatial_axes)

            # Final dense layer built from explicit variables so that the
            # weight can be multiplied by a non-trainable mask.
            dense_w = tf.get_variable(
                initializer=tf.random_normal_initializer(stddev=.01),
                trainable=True,
                shape=(net.shape[-1], self.num_classes),
                dtype=net.dtype,
                name=lottery.weight_name_of_base_name('dense'))

            dense_b = tf.get_variable(
                initializer=tf.zeros_initializer(),
                trainable=True,
                shape=(self.num_classes, ),
                dtype=net.dtype,
                name=lottery.bias_name_of_base_name('dense'))

            dense_mask = tf.get_variable(
                initializer=tf.ones(dense_w.shape, dtype=net.dtype),
                trainable=False,
                dtype=net.dtype,
                name=lottery.mask_name_of_base_name('dense'))

            masked_w = dense_w * dense_mask
            logits = net @ masked_w + dense_b

            return tf.identity(logits, 'final_dense')
    def model(inputs, is_training):
        """Creation of the model graph.

        Applies the initial 7x7 stride-2 convolution, batch norm + ReLU and
        a 3x3 stride-2 max pool, then four block groups with 64, 128, 256
        and 512 filters, a global average pool and a masked dense layer.

        Args:
          inputs: Input tensor, laid out according to the enclosing
            `data_format`.
          is_training: Forwarded to `batch_norm_relu` and `block_group`.

        Returns:
          The output of the final dense layer, reshaped to
          `[-1, num_classes]` by the preceding reshape and matmul.
        """
        inputs = conv2d_fixed_padding(inputs=inputs,
                                      filters=64,
                                      kernel_size=7,
                                      strides=2,
                                      data_format=data_format)
        inputs = tf.identity(inputs, 'initial_conv')
        inputs = batch_norm_relu(inputs, is_training, data_format=data_format)

        inputs = tf.layers.max_pooling2d(inputs=inputs,
                                         pool_size=3,
                                         strides=2,
                                         padding='SAME',
                                         data_format=data_format)
        inputs = tf.identity(inputs, 'initial_max_pool')

        # (filters, strides) of the four block groups, in order. Only the
        # first group keeps the spatial resolution.
        group_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for idx, (group_filters, group_strides) in enumerate(group_specs):
            inputs = block_group(
                inputs=inputs,
                filters=group_filters,
                block_fn=block_fn,
                blocks=layers[idx],
                strides=group_strides,
                is_training=is_training,
                name='block_group{}'.format(idx + 1),
                data_format=data_format,
                dropblock_keep_prob=dropblock_keep_probs[idx],
                dropblock_size=dropblock_size)

        # The activation is 7x7 so this is a global average pool.
        # TODO(huangyp): reduce_mean will be faster.
        # The spatial dimensions sit at axes (2, 3) for channels_first and
        # at (1, 2) otherwise; using (1, 2) unconditionally would pool over
        # (channels, height) for channels_first inputs.
        if data_format == 'channels_first':
            pool_size = (inputs.shape[2], inputs.shape[3])
        else:
            pool_size = (inputs.shape[1], inputs.shape[2])
        inputs = tf.layers.average_pooling2d(inputs=inputs,
                                             pool_size=pool_size,
                                             strides=1,
                                             padding='VALID',
                                             data_format=data_format)
        inputs = tf.identity(inputs, 'final_avg_pool')
        final_width = 2048 if block_fn is bottleneck_block else 512
        inputs = tf.reshape(inputs, [-1, final_width])

        # Dense layer built from explicit variables so that the weight can
        # be multiplied by a non-trainable all-ones mask.
        dense_w = tf.get_variable(
            initializer=tf.random_normal_initializer(stddev=.01),
            trainable=True,
            shape=(final_width, num_classes),
            dtype=inputs.dtype,
            name=lottery.weight_name_of_base_name('dense'))

        dense_b = tf.get_variable(initializer=tf.zeros_initializer(),
                                  trainable=True,
                                  shape=(num_classes, ),
                                  dtype=inputs.dtype,
                                  name=lottery.bias_name_of_base_name('dense'))

        dense_mask = tf.get_variable(
            initializer=lambda: tf.ones(dense_w.shape, dtype=inputs.dtype),
            trainable=False,
            dtype=inputs.dtype,
            name=lottery.mask_name_of_base_name('dense'))

        inputs = inputs @ (dense_w * dense_mask) + dense_b
        inputs = tf.identity(inputs, 'final_dense')
        return inputs
def conv2d_fixed_padding(inputs,
                         filters,
                         kernel_size,
                         strides,
                         data_format='channels_first',
                         projection=False):
    """Masked, strided 2-D convolution with explicit padding.

    The padding is consistent and is based only on `kernel_size`, not on the
    dimensions of `inputs` (as opposed to using `tf.layers.conv2d` alone).
    The kernel is multiplied element-wise by a non-trainable all-ones mask
    variable before the convolution is applied.

    Args:
      inputs: `Tensor` laid out as `[batch, height, width, channels]` for
        "channels_last" and `[batch, channels, height, width]` otherwise.
      filters: `int` number of filters in the convolution.
      kernel_size: `int` size of the kernel to be used in the convolution.
      strides: `int` strides of the convolution.
      data_format: `str`; "channels_last" selects NHWC, any other value
        selects NCHW.
      projection: `bool`; when True the variables are named
        `proj_<layer_idx>_<proj_idx>` and the module-level layer counter is
        left untouched; otherwise they are named `conv_<layer_idx>` and the
        layer counter is advanced. The projection counter advances on every
        call.

    Returns:
      The output `Tensor` of the convolution, in the same data format as
      `inputs`.
    """
    global _layer_idx, _proj_idx

    # Strided convolutions are padded explicitly and then run as 'VALID'.
    if strides > 1:
        inputs = fixed_padding(inputs, kernel_size, data_format)

    nhwc = data_format == 'channels_last'
    input_dim = inputs.shape[-1] if nhwc else inputs.shape[1]
    channel_format = 'NHWC' if nhwc else 'NCHW'
    if nhwc:
        strides_tuple = [1, strides, strides, 1]
    else:
        strides_tuple = [1, 1, strides, strides]

    kernel_shape = (kernel_size, kernel_size, input_dim, filters)

    if projection:
        base_name = 'proj_{}_{}'.format(_layer_idx, _proj_idx)
    else:
        base_name = 'conv_{}'.format(_layer_idx)

    kernel = tf.get_variable(
        initializer=tf.compat.v1.variance_scaling_initializer(),
        trainable=True,
        shape=kernel_shape,
        dtype=inputs.dtype,
        name=lottery.weight_name_of_base_name(base_name))

    # The mask initializer is a callable, so the ones tensor is only built
    # when the initializer is invoked.
    mask = tf.get_variable(
        initializer=lambda: tf.ones(kernel_shape, dtype=inputs.dtype),
        trainable=False,
        dtype=inputs.dtype,
        name=lottery.mask_name_of_base_name(base_name),
    )

    masked_kernel = tf.math.multiply(kernel, mask)

    if strides == 1:
        padding_mode = 'SAME'
    else:
        padding_mode = 'VALID'

    outputs = tf.compat.v1.nn.conv2d(
        inputs,
        masked_kernel,
        strides_tuple,
        padding=padding_mode,
        data_format=channel_format,
    )

    _proj_idx += 1
    if not projection:
        _layer_idx += 1

    return outputs