Example #1
0
    def __init__(self,
                 config: GroverConfig,
                 is_training,
                 input_ids,
                 cache=None,
                 do_cache=False,
                 pad_token_id=0,
                 chop_off_last_token=True,
                 scope=None,
                 reuse=False):
        """
        :param config:
        :param is_training:
        :param input_ids: Tensor thats of size [batch_size, seq_length]
        :param cache: Optionally, a tensor to use that will contain cached information of the size
            [batch_size, num_layers, 2, num_heads, cache_length, features]
        :param do_cache: Whether to cache again.
        :param pad_token_id: Which token will be used for padding (probably 0.)
        :param chop_off_last_token: True if we will end up using this for TRAINING only. False if we want to generate.
                                    it means the last token in input_ids will not be processed by the model as input
        :param scope: scope to run this on
        """
        self.config = copy.deepcopy(config)
        self.is_training = is_training
        self.pad_token_id = pad_token_id

        if not is_training:
            self.config.hidden_dropout_prob = 0.0
            self.config.attention_probs_dropout_prob = 0.0

        if chop_off_last_token:
            self.target_ids = input_ids[:, 1:]
            self.input_ids = input_ids[:, :-1]
        else:
            self.input_ids = input_ids
            self.target_ids = tf.concat((input_ids[:, 1:],
                                         tf.constant(self.pad_token_id, dtype=self.input_ids.dtype,
                                                     shape=[get_shape_list(self.input_ids, 2)[0], 1])), 1)

        self.batch_size, self.seq_length = get_shape_list(self.input_ids, 2)

        if cache is None:
            caches = [None] * config.num_hidden_layers
            self.cache_length = 0
        else:
            batch_size_, num_layers_, two_, num_heads_, self.cache_length, features_ = get_shape_list(
                cache, expected_rank=6)
            assert batch_size_ == self.batch_size
            assert num_layers_ == config.num_hidden_layers
            assert two_ == 2
            assert num_heads_ == config.num_attention_heads
            assert features_ == (config.hidden_size // config.num_attention_heads)
            caches = tf.unstack(cache, axis=1)

        with tf.variable_scope(scope, default_name='newslm', reuse=reuse):
            with tf.variable_scope("embeddings"):
                embeddings, self.embedding_table = embed(self.input_ids, config.vocab_size,
                                                         config.hidden_size,
                                                         position_offset=self.cache_length,
                                                         initializer_range=config.initializer_range,
                                                         max_position_embeddings=config.max_position_embeddings,
                                                         use_one_hot_embeddings=True)

            mask = get_attention_mask(self.seq_length, self.seq_length + self.cache_length, dtype=embeddings.dtype)

            # We keep the representation as a 2D tensor to avoid re-shaping it back and
            # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
            # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
            # help the optimizer.
            hidden_state = tf.reshape(embeddings, [self.batch_size * self.seq_length, self.config.hidden_size])
            new_kvs = []
            for layer_idx, layer_cache in enumerate(caches):
                with tf.variable_scope('layer{:02d}'.format(layer_idx)):
                    # [batch_size * seq_length, hidden_size]
                    attention_output, new_kv = attention_layer(
                        hidden_state,
                        mask,
                        batch_size=self.batch_size,
                        seq_length=self.seq_length,
                        size_per_head=config.hidden_size // config.num_attention_heads,
                        num_attention_heads=config.num_attention_heads,
                        initializer_range=config.initializer_range,
                        hidden_dropout_prob=self.config.hidden_dropout_prob,
                        attention_probs_dropout_prob=self.config.attention_probs_dropout_prob,
                        do_cache=do_cache,
                        cache=layer_cache,
                    )
                    new_kvs.append(new_kv)

                    # [batch_size * seq_length, hidden_size]
                    hidden_state = residual_mlp_layer(hidden_state + attention_output,
                                                      intermediate_size=config.intermediate_size,
                                                      hidden_dropout_prob=self.config.hidden_dropout_prob)
            self.hidden_state = hidden_state

        self.new_kvs = tf.stack(new_kvs, axis=1) if do_cache else None

        # Note that the hidden state is still flat (batch_size*hidden_size)
        self.logits_flat = tf.matmul(self.hidden_state, self.embedding_table, transpose_b=True)
Example #2
0
def inception_v3(inputs,
                 dropout_keep_prob=0.8,
                 num_classes=1000,
                 is_training=True,
                 restore_logits=True,
                 scope=''):
    """Latest Inception from http://arxiv.org/abs/1512.00567.

    "Rethinking the Inception Architecture for Computer Vision"

    Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens,
    Zbigniew Wojna

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    dropout_keep_prob: dropout keep_prob.
    num_classes: number of predicted classes.
    is_training: whether is training or not.
    restore_logits: whether or not the logits layers should be restored.
      Useful for fine-tuning a model with different num_classes.
    scope: Optional scope for name_scope.

  Returns:
    a list containing 'logits', 'aux_logits' Tensors.
  """
    # end_points will collect relevant activations for external use, for example
    # summaries or losses.
    end_points = {}
    with tf.name_scope(scope, 'inception_v3', [inputs]):
        with scopes.arg_scope(
            [ops.conv2d, ops.fc, ops.batch_norm, ops.dropout],
                is_training=is_training):
            with scopes.arg_scope([ops.conv2d, ops.max_pool, ops.avg_pool],
                                  stride=1,
                                  padding='VALID'):
                # 299 x 299 x 3
                end_points['conv0'] = ops.conv2d(inputs,
                                                 32, [3, 3],
                                                 stride=2,
                                                 scope='conv0')
                # 149 x 149 x 32
                end_points['conv1'] = ops.conv2d(end_points['conv0'],
                                                 32, [3, 3],
                                                 scope='conv1')
                # 147 x 147 x 32
                end_points['conv2'] = ops.conv2d(end_points['conv1'],
                                                 64, [3, 3],
                                                 padding='SAME',
                                                 scope='conv2')
                # 147 x 147 x 64
                end_points['pool1'] = ops.max_pool(end_points['conv2'], [3, 3],
                                                   stride=2,
                                                   scope='pool1')
                # 73 x 73 x 64
                end_points['conv3'] = ops.conv2d(end_points['pool1'],
                                                 80, [1, 1],
                                                 scope='conv3')
                # 73 x 73 x 80.
                end_points['conv4'] = ops.conv2d(end_points['conv3'],
                                                 192, [3, 3],
                                                 scope='conv4')
                # 71 x 71 x 192.
                end_points['pool2'] = ops.max_pool(end_points['conv4'], [3, 3],
                                                   stride=2,
                                                   scope='pool2')
                # 35 x 35 x 192.
                net = end_points['pool2']
            # Inception blocks
            with scopes.arg_scope([ops.conv2d, ops.max_pool, ops.avg_pool],
                                  stride=1,
                                  padding='SAME'):
                # mixed: 35 x 35 x 256.
                with tf.variable_scope('mixed_35x35x256a'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 64, [1, 1])
                    with tf.variable_scope('branch5x5'):
                        branch5x5 = ops.conv2d(net, 48, [1, 1])
                        branch5x5 = ops.conv2d(branch5x5, 64, [5, 5])
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 64, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 32, [1, 1])
                    net = tf.concat(
                        [branch1x1, branch5x5, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_35x35x256a'] = net
                # mixed_1: 35 x 35 x 288.
                with tf.variable_scope('mixed_35x35x288a'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 64, [1, 1])
                    with tf.variable_scope('branch5x5'):
                        branch5x5 = ops.conv2d(net, 48, [1, 1])
                        branch5x5 = ops.conv2d(branch5x5, 64, [5, 5])
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 64, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 64, [1, 1])
                    net = tf.concat(
                        [branch1x1, branch5x5, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_35x35x288a'] = net
                # mixed_2: 35 x 35 x 288.
                with tf.variable_scope('mixed_35x35x288b'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 64, [1, 1])
                    with tf.variable_scope('branch5x5'):
                        branch5x5 = ops.conv2d(net, 48, [1, 1])
                        branch5x5 = ops.conv2d(branch5x5, 64, [5, 5])
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 64, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 64, [1, 1])
                    net = tf.concat(
                        [branch1x1, branch5x5, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_35x35x288b'] = net
                # mixed_3: 17 x 17 x 768.
                with tf.variable_scope('mixed_17x17x768a'):
                    with tf.variable_scope('branch3x3'):
                        branch3x3 = ops.conv2d(net,
                                               384, [3, 3],
                                               stride=2,
                                               padding='VALID')
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 64, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                        branch3x3dbl = ops.conv2d(branch3x3dbl,
                                                  96, [3, 3],
                                                  stride=2,
                                                  padding='VALID')
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.max_pool(net, [3, 3],
                                                   stride=2,
                                                   padding='VALID')
                    net = tf.concat([branch3x3, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_17x17x768a'] = net
                # mixed4: 17 x 17 x 768.
                with tf.variable_scope('mixed_17x17x768b'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 192, [1, 1])
                    with tf.variable_scope('branch7x7'):
                        branch7x7 = ops.conv2d(net, 128, [1, 1])
                        branch7x7 = ops.conv2d(branch7x7, 128, [1, 7])
                        branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
                    with tf.variable_scope('branch7x7dbl'):
                        branch7x7dbl = ops.conv2d(net, 128, [1, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [1, 7])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat(
                        [branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
                    end_points['mixed_17x17x768b'] = net
                # mixed_5: 17 x 17 x 768.
                with tf.variable_scope('mixed_17x17x768c'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 192, [1, 1])
                    with tf.variable_scope('branch7x7'):
                        branch7x7 = ops.conv2d(net, 160, [1, 1])
                        branch7x7 = ops.conv2d(branch7x7, 160, [1, 7])
                        branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
                    with tf.variable_scope('branch7x7dbl'):
                        branch7x7dbl = ops.conv2d(net, 160, [1, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [1, 7])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat(
                        [branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
                    end_points['mixed_17x17x768c'] = net
                # mixed_6: 17 x 17 x 768.
                with tf.variable_scope('mixed_17x17x768d'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 192, [1, 1])
                    with tf.variable_scope('branch7x7'):
                        branch7x7 = ops.conv2d(net, 160, [1, 1])
                        branch7x7 = ops.conv2d(branch7x7, 160, [1, 7])
                        branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
                    with tf.variable_scope('branch7x7dbl'):
                        branch7x7dbl = ops.conv2d(net, 160, [1, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [1, 7])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat(
                        [branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
                    end_points['mixed_17x17x768d'] = net
                # mixed_7: 17 x 17 x 768.
                with tf.variable_scope('mixed_17x17x768e'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 192, [1, 1])
                    with tf.variable_scope('branch7x7'):
                        branch7x7 = ops.conv2d(net, 192, [1, 1])
                        branch7x7 = ops.conv2d(branch7x7, 192, [1, 7])
                        branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
                    with tf.variable_scope('branch7x7dbl'):
                        branch7x7dbl = ops.conv2d(net, 192, [1, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat(
                        [branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
                    end_points['mixed_17x17x768e'] = net
                # Auxiliary Head logits
                aux_logits = tf.identity(end_points['mixed_17x17x768e'])
                with tf.variable_scope('aux_logits'):
                    aux_logits = ops.avg_pool(aux_logits, [5, 5],
                                              stride=3,
                                              padding='VALID')
                    aux_logits = ops.conv2d(aux_logits,
                                            128, [1, 1],
                                            scope='proj')
                    # Shape of feature map before the final layer.
                    shape = aux_logits.get_shape()
                    aux_logits = ops.conv2d(aux_logits,
                                            768,
                                            shape[1:3],
                                            stddev=0.01,
                                            padding='VALID')
                    aux_logits = ops.flatten(aux_logits)
                    aux_logits = ops.fc(aux_logits,
                                        num_classes,
                                        activation=None,
                                        stddev=0.001,
                                        restore=restore_logits)
                    end_points['aux_logits'] = aux_logits
                # mixed_8: 8 x 8 x 1280.
                # Note that the scope below is not changed to not void previous
                # checkpoints.
                # (TODO) Fix the scope when appropriate.
                with tf.variable_scope('mixed_17x17x1280a'):
                    with tf.variable_scope('branch3x3'):
                        branch3x3 = ops.conv2d(net, 192, [1, 1])
                        branch3x3 = ops.conv2d(branch3x3,
                                               320, [3, 3],
                                               stride=2,
                                               padding='VALID')
                    with tf.variable_scope('branch7x7x3'):
                        branch7x7x3 = ops.conv2d(net, 192, [1, 1])
                        branch7x7x3 = ops.conv2d(branch7x7x3, 192, [1, 7])
                        branch7x7x3 = ops.conv2d(branch7x7x3, 192, [7, 1])
                        branch7x7x3 = ops.conv2d(branch7x7x3,
                                                 192, [3, 3],
                                                 stride=2,
                                                 padding='VALID')
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.max_pool(net, [3, 3],
                                                   stride=2,
                                                   padding='VALID')
                    net = tf.concat([branch3x3, branch7x7x3, branch_pool], 3)
                    end_points['mixed_17x17x1280a'] = net
                # mixed_9: 8 x 8 x 2048.
                with tf.variable_scope('mixed_8x8x2048a'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 320, [1, 1])
                    with tf.variable_scope('branch3x3'):
                        branch3x3 = ops.conv2d(net, 384, [1, 1])
                        branch3x3 = tf.concat([
                            ops.conv2d(branch3x3, 384, [1, 3]),
                            ops.conv2d(branch3x3, 384, [3, 1])
                        ], 3)
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 448, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3])
                        branch3x3dbl = tf.concat([
                            ops.conv2d(branch3x3dbl, 384, [1, 3]),
                            ops.conv2d(branch3x3dbl, 384, [3, 1])
                        ], 3)
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat(
                        [branch1x1, branch3x3, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_8x8x2048a'] = net
                # mixed_10: 8 x 8 x 2048.
                with tf.variable_scope('mixed_8x8x2048b'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 320, [1, 1])
                    with tf.variable_scope('branch3x3'):
                        branch3x3 = ops.conv2d(net, 384, [1, 1])
                        branch3x3 = tf.concat([
                            ops.conv2d(branch3x3, 384, [1, 3]),
                            ops.conv2d(branch3x3, 384, [3, 1])
                        ], 3)
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 448, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3])
                        branch3x3dbl = tf.concat([
                            ops.conv2d(branch3x3dbl, 384, [1, 3]),
                            ops.conv2d(branch3x3dbl, 384, [3, 1])
                        ], 3)
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat(
                        [branch1x1, branch3x3, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_8x8x2048b'] = net
                # Final pooling and prediction
                with tf.variable_scope('logits'):
                    shape = net.get_shape()
                    net = ops.avg_pool(net,
                                       shape[1:3],
                                       padding='VALID',
                                       scope='pool')
                    # 1 x 1 x 2048
                    net = ops.dropout(net, dropout_keep_prob, scope='dropout')
                    net = ops.flatten(net, scope='flatten')
                    # 2048
                    logits = ops.fc(net,
                                    num_classes,
                                    activation=None,
                                    scope='logits',
                                    restore=restore_logits)
                    # 1000
                    end_points['logits'] = logits
                    end_points['predictions'] = tf.nn.softmax(
                        logits, name='predictions')
            return logits, end_points
def expanded_conv(input_tensor,
                  num_outputs,
                  expansion_size=expand_input_by_factor(6),
                  stride=1,
                  rate=1,
                  kernel_size=(3, 3),
                  residual=True,
                  normalizer_fn=None,
                  split_projection=1,
                  split_expansion=1,
                  expansion_transform=None,
                  depthwise_location='expansion',
                  depthwise_channel_multiplier=1,
                  endpoints=None,
                  use_explicit_padding=False,
                  padding='SAME',
                  scope=None):
  """Depthwise Convolution Block with expansion.

  Builds a composite convolution that has the following structure
  expansion (1x1) -> depthwise (kernel_size) -> projection (1x1)
  Args:
    input_tensor: input
    num_outputs: number of outputs in the final layer.
    expansion_size: the size of expansion, could be a constant or a callable. If
      latter it will be provided 'num_inputs' as an input. For forward
      compatibility it should accept arbitrary keyword arguments. Default will
      expand the input by factor of 6.
    stride: depthwise stride
    rate: depthwise rate
    kernel_size: depthwise kernel
    residual: whether to include residual connection between input and output.
    normalizer_fn: batchnorm or otherwise
    split_projection: how many ways to split projection operator (that is conv
      expansion->bottleneck)
    split_expansion: how many ways to split expansion op (that is conv
      bottleneck->expansion) ops will keep depth divisible by this value.
    expansion_transform: Optional function that takes expansion as a single
      input and returns output.
    depthwise_location: where to put depthwise covnvolutions supported values
      None, 'input', 'output', 'expansion'
    depthwise_channel_multiplier: depthwise channel multiplier: each input will
      replicated (with different filters) that many times. So if input had c
      channels, output will have c x depthwise_channel_multpilier.
    endpoints: An optional dictionary into which intermediate endpoints are
      placed. The keys "expansion_output", "depthwise_output",
      "projection_output" and "expansion_transform" are always populated, even
      if the corresponding functions are not invoked.
    use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
      inputs so that the output dimensions are the same as if 'SAME' padding
      were used.
    padding: Padding type to use if `use_explicit_padding` is not set.
    scope: optional scope.

  Returns:
    Tensor of depth num_outputs
  Raises:
    TypeError: on inval
  """
  with tf.variable_scope(scope, default_name='expanded_conv') as s, \
       tf.name_scope(s.original_name_scope):
    prev_depth = input_tensor.get_shape().as_list()[3]
    if depthwise_location not in [None, 'input', 'output', 'expansion']:
      raise TypeError(
          '%r is unknown value for depthwise_location' % depthwise_location)
    if use_explicit_padding:
      if padding != 'SAME':
        raise TypeError('`use_explicit_padding` should only be used with '
                        '"SAME" padding.')
      padding = 'VALID'
    depthwise_func = functools.partial(
        slim.separable_conv2d,
        num_outputs=None,
        kernel_size=kernel_size,
        depth_multiplier=depthwise_channel_multiplier,
        stride=stride,
        rate=rate,
        normalizer_fn=normalizer_fn,
        padding=padding,
        scope='depthwise')
    # b1 -> b2 * r -> b2
    #   i -> (o * r) (bottleneck) -> o
    input_tensor = tf.identity(input_tensor, 'input')
    net = input_tensor

    if depthwise_location == 'input':
      if use_explicit_padding:
        net = _fixed_padding(net, kernel_size, rate)
      net = depthwise_func(net, activation_fn=None)

    if callable(expansion_size):
      inner_size = expansion_size(num_inputs=prev_depth)
    else:
      inner_size = expansion_size

    if inner_size > net.shape[3]:
      net = split_conv(
          net,
          inner_size,
          num_ways=split_expansion,
          scope='expand',
          stride=1,
          normalizer_fn=normalizer_fn)
      net = tf.identity(net, 'expansion_output')
    if endpoints is not None:
      endpoints['expansion_output'] = net

    if depthwise_location == 'expansion':
      if use_explicit_padding:
        net = _fixed_padding(net, kernel_size, rate)
      net = depthwise_func(net)

    net = tf.identity(net, name='depthwise_output')
    if endpoints is not None:
      endpoints['depthwise_output'] = net
    if expansion_transform:
      net = expansion_transform(expansion_tensor=net, input_tensor=input_tensor)
    # Note in contrast with expansion, we always have
    # projection to produce the desired output size.
    net = split_conv(
        net,
        num_outputs,
        num_ways=split_projection,
        stride=1,
        scope='project',
        normalizer_fn=normalizer_fn,
        activation_fn=tf.identity)
    if endpoints is not None:
      endpoints['projection_output'] = net
    if depthwise_location == 'output':
      if use_explicit_padding:
        net = _fixed_padding(net, kernel_size, rate)
      net = depthwise_func(net, activation_fn=None)

    if callable(residual):  # custom residual
      net = residual(input_tensor=input_tensor, output_tensor=net)
    elif (
        residual and
        # stride check enforces that we don't add residuals when spatial
        # dimensions are None
        stride == 1 and
        # Depth matches
        net.get_shape().as_list()[3] == input_tensor.get_shape().as_list()[3]):
      net += input_tensor
    return tf.identity(net, name='output')
Example #4
0
  def __init__(self, sess, ob_space, action_space, nbatch, nsteps, reuse = False):
    # This will use to initialize our kernels
    gain = np.sqrt(2)

    # Based on the action space, will select what probability distribution type
    # we will use to distribute action in our stochastic policy (in our case DiagGaussianPdType
    # aka Diagonal Gaussian, 3D normal distribution
    self.pdtype = make_pdtype(action_space)

    height, weight, channel = ob_space.shape
    ob_shape = (height, weight, channel)

    # Create the input placeholder
    inputs_ = tf.placeholder(tf.float32, [None, *ob_shape], name="input")

    # Normalize the images
    scaled_images = tf.cast(inputs_, tf.float32) / 255.

    """
    Build the model
    3 CNN for spatial dependencies
    Temporal dependencies is handle by stacking frames
    (Something funny nobody use LSTM in OpenAI Retro contest)
    1 common FC
    1 FC for policy
    1 FC for value
    """
    with tf.variable_scope("model", reuse = reuse):
      conv1 = conv_layer(scaled_images, 32, 8, 4, gain)
      conv2 = conv_layer(conv1, 64, 4, 2, gain)
      conv3 = conv_layer(conv2, 64, 3, 1, gain)
      flatten1 = tf.layers.flatten(conv3)
      fc_common = fc_layer(flatten1, 512, gain=gain)

      # This build a fc connected layer that returns a probability distribution
      # over actions (self.pd) and our pi logits (self.pi).
      self.pd, self.pi = self.pdtype.pdfromlatent(fc_common, init_scale=0.01)
      
      # Calculate the v(s)
      vf = fc_layer(fc_common, 1, activation_fn=None)[:, 0]

    self.initial_state = None

    # Take an action in the action distribution (remember we are in a situation
    # of stochastic policy so we don't always take the action with the highest probability
    # for instance if we have 2 actions 0.7 and 0.3 we have 30% chance to take the second)
    a0 = self.pd.sample()

    # Function use to take a step returns action to take and V(s)
    def step(state_in, *_args, **_kwargs):
      action, value = sess.run([a0, vf], {inputs_: state_in})
      #print("step", action)
      return action, value

    # Function that calculates only the V(s)
    def value(state_in, *_args, **_kwargs):
      return sess.run(vf, {inputs_: state_in})

    # Function that output only the action to take
    def select_action(state_in, *_args, **_kwargs):
      return sess.run(a0, {inputs_: state_in})

    self.inputs_ = inputs_
    self.vf = vf
    self.step = step
    self.value = value
    self.select_action = select_action
def conv2d(x,
           kernel_size,
           stride,
           channels,
           is_training,
           scope="conv2d",
           batch_norm=False,
           residual=False,
           gated=False,
           activation_fn=tf.nn.relu,
           resize=False,
           transpose=False,
           stacked_layers=1):
    """2D-Conv with optional batch_norm, gating, residual.

  Args:
    x: Tensor input [MB, H, W, CH].
    kernel_size: List [H, W].
    stride: List [H, W].
    channels: Int, output channels.
    is_training: Whether to collect stats for BatchNorm.
    scope: Enclosing scope name.
    batch_norm: Apply batch normalization
    residual: Residual connections, have stacked_layers >= 2.
    gated: Gating ala Wavenet.
    activation_fn: Nonlinearity function.
    resize: On transposed convolution, do ImageResize instead of conv_transpose.
    transpose: Use conv_transpose instead of conv.
    stacked_layers: Number of layers before a residual connection.

  Returns:
    x: Tensor output.
  """
    # For residual
    x0 = x
    # Choose convolution function
    conv_fn = slim.conv2d_transpose if transpose else slim.conv2d
    # Double output channels for gates
    num_outputs = channels * 2 if gated else channels
    normalizer_fn = slim.batch_norm if batch_norm else None

    with tf.variable_scope(scope + "_Layer"):
        # Apply a stack of convolutions Before adding residual
        for layer_idx in range(stacked_layers):
            with slim.arg_scope(
                    slim_batchnorm_arg_scope(is_training, activation_fn=None)):
                # Use interpolation to upsample instead of conv_transpose
                if transpose and resize:
                    unused_mb, h, w, unused_ch = x.get_shape().as_list()
                    x = tf.image.resize_images(
                        x, size=[h * stride[0], w * stride[1]], method=0)
                    stride_conv = [1, 1]
                else:
                    stride_conv = stride

                x = conv_fn(inputs=x,
                            stride=stride_conv,
                            kernel_size=kernel_size,
                            num_outputs=num_outputs,
                            normalizer_fn=normalizer_fn,
                            biases_initializer=tf.zeros_initializer(),
                            scope=scope)

                if gated:
                    with tf.variable_scope("Gated"):
                        x1, x2 = x[:, :, :, :channels], x[:, :, :, channels:]
                        if activation_fn:
                            x1, x2 = activation_fn(x1), tf.sigmoid(x2)
                        else:
                            x2 = tf.sigmoid(x2)
                        x = x1 * x2

                # Apply residual to last layer  before the last nonlinearity
                if residual and (layer_idx == stacked_layers - 1):
                    with tf.variable_scope("Residual"):
                        # Don't upsample residual in time
                        if stride[0] == 1 and stride[1] == 1:
                            channels_in = x0.get_shape().as_list()[-1]
                            # Make n_channels match for residual
                            if channels != channels_in:
                                x0 = slim.conv2d(
                                    inputs=x0,
                                    stride=[1, 1],
                                    kernel_size=[1, 1],
                                    num_outputs=channels,
                                    normalizer_fn=None,
                                    activation_fn=None,
                                    biases_initializer=tf.zeros_initializer,
                                    scope=scope + "_residual")
                                x += x0
                            else:
                                x += x0
                if activation_fn and not gated:
                    x = activation_fn(x)
        return x
    def make_model(self):

        hparams = transformer.transformer_small()
        hparams.hidden_size = 8
        hparams.filter_size = 32
        hparams.num_heads = 1
        hparams.layer_prepostprocess_dropout = 0.0

        if hparams.get("problem_hparams", None) is None:
            p_hparams = problem_hparams.test_problem_hparams(
                VOCAB_SIZE, VOCAB_SIZE, hparams)
        hparams.problem_hparams = p_hparams
        self.model = model_cls(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams)

        self.placeholders['inputs'] = tf.placeholder(
            tf.int32, [None, params['maxLength']], name='inputs')
        self.placeholders['targets'] = tf.placeholder(
            tf.int32, [None, params['maxLength']], name='targets')
        features = {
            "inputs": self.placeholders['inputs'],
            "targets": self.placeholders['targets'],
            "target_space_id": 0
        }
        self.logits = self.model(features)

        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logit=tf.reshape(self.logits, [-1, VOCAB_SIZE]),
            labels=tf.reshape(features["targets"], [-1]))
        self.ops['loss'] = tf.reduce_mean(loss)
        apply_grad = tf.train.AdamOptimizer(0.001).minimize(self.ops['loss'])

        self.ops['losses'] = []
        for (internal_id, task_id) in enumerate(self.params['task_ids']):
            with tf.variable_scope("out_layer_task%i" % task_id):
                with tf.variable_scope("regression_gate"):
                    self.weights['regression_gate_task%i' % task_id] = MLP(
                        2 * self.params['hidden_size'], 1, [],
                        self.placeholders['out_layer_dropout_keep_prob'])
                with tf.variable_scope("regression"):
                    self.weights[
                        'regression_transform_task%i' % task_id] = MLP(
                            self.params['hidden_size'], 1, [],
                            self.placeholders['out_layer_dropout_keep_prob'])
                computed_values = self.gated_regression(
                    self.ops['final_node_representations'],
                    self.weights['regression_gate_task%i' % task_id],
                    self.weights['regression_transform_task%i' % task_id])
                diff = computed_values - self.placeholders['target_values'][
                    internal_id, :]
                task_target_mask = self.placeholders['target_mask'][
                    internal_id, :]
                task_target_num = tf.reduce_sum(
                    task_target_mask) + SMALL_NUMBER
                diff = diff * task_target_mask  # Mask out unused values
                self.ops['accuracy_task%i' % task_id] = tf.reduce_sum(
                    tf.abs(diff)) / task_target_num
                task_loss = tf.reduce_sum(
                    0.5 * tf.square(diff)) / task_target_num
                # Normalise loss to account for fewer task-specific examples in batch:
                task_loss = task_loss * (
                    1.0 /
                    (self.params['task_sample_ratios'].get(task_id) or 1.0))
                self.ops['losses'].append(task_loss)
        self.ops['loss'] = tf.reduce_sum(self.ops['losses'])
Example #7
0
def quantizable_concat(inputs,
                       axis,
                       is_training,
                       is_quantized=True,
                       default_min=0,
                       default_max=6,
                       ema_decay=0.999,
                       scope='quantized_concat'):
    """Concat replacement with quantization option.

  Allows concat inputs to share the same min max ranges,
  from experimental/gazelle/synthetic/model/tpu/utils.py.

  Args:
    inputs: list of tensors to concatenate.
    axis: dimension along which to concatenate.
    is_training: true if the graph is a training graph.
    is_quantized: flag to enable/disable quantization.
    default_min: default min value for fake quant op.
    default_max: default max value for fake quant op.
    ema_decay: the moving average decay for the quantization variables.
    scope: Optional scope for variable_scope.

  Returns:
    Tensor resulting from concatenation of input tensors
  """
    if is_quantized:
        with tf.variable_scope(scope):
            tf.logging.info('inputs: {}'.format(inputs))
            for t in inputs:
                tf.logging.info(t)

            min_var = _quant_var('min', default_min)
            max_var = _quant_var('max', default_max)
            if not is_training:
                # If we are building an eval graph just use the values in the variables.
                quant_inputs = [
                    tf.fake_quant_with_min_max_vars(t, min_var, max_var)
                    for t in inputs
                ]
                tf.logging.info('min_val: {}'.format(min_var))
                tf.logging.info('max_val: {}'.format(max_var))
            else:
                concat_tensors = tf.concat(inputs, axis=axis)
                tf.logging.info('concat_tensors: {}'.format(concat_tensors))
                # TFLite requires that 0.0 is always in the [min; max] range.
                range_min = tf.minimum(tf.reduce_min(concat_tensors),
                                       0.0,
                                       name='SafeQuantRangeMin')
                range_max = tf.maximum(tf.reduce_max(concat_tensors),
                                       0.0,
                                       name='SafeQuantRangeMax')
                # Otherwise we need to keep track of the moving averages of the min and
                # of the elements of the input tensor max.
                min_val = moving_averages.assign_moving_average(
                    min_var, range_min, ema_decay, name='AssignMinEma')
                max_val = moving_averages.assign_moving_average(
                    max_var, range_max, ema_decay, name='AssignMaxEma')
                tf.logging.info('min_val: {}'.format(min_val))
                tf.logging.info('max_val: {}'.format(max_val))
                quant_inputs = [
                    tf.fake_quant_with_min_max_vars(t, min_val, max_val)
                    for t in inputs
                ]
            tf.logging.info('quant_inputs: {}'.format(quant_inputs))
            outputs = tf.concat(quant_inputs, axis=axis)
            tf.logging.info('outputs: {}'.format(outputs))
    else:
        outputs = tf.concat(inputs, axis=axis)
    return outputs
Example #8
0
    def model(inputs, is_training):
        """Creation of the model graph."""
        with tf.variable_scope(name, 'resnet_model'):
            inputs = resnet_model.fixed_padding(inputs,
                                                kernel_size=3,
                                                data_format=data_format)
            padding = 'VALID'

            kernel_initializer = tf.variance_scaling_initializer()
            kernel_regularizer = contrib_layers.l2_regularizer(weight_decay)

            inputs = tf.layers.conv2d(inputs=inputs,
                                      filters=_make_divisible(32 * width),
                                      kernel_size=3,
                                      strides=2,
                                      padding=padding,
                                      use_bias=False,
                                      kernel_initializer=kernel_initializer,
                                      kernel_regularizer=kernel_regularizer,
                                      data_format=data_format,
                                      name='initial_conv')

            inputs = tf.identity(inputs, 'initial_conv')
            inputs = resnet_model.batch_norm_relu(inputs,
                                                  is_training,
                                                  data_format=data_format)

            inverted_res_block = functools.partial(
                inverted_res_block_,
                is_training=is_training,
                width=width,
                expansion_factor=expansion_factor,
                pruning_method=pruning_method,
                data_format=data_format,
                weight_decay=weight_decay)

            inputs = inverted_res_block(inputs,
                                        filters=16,
                                        stride=1,
                                        block_id=0)

            inputs = inverted_res_block(inputs,
                                        filters=24,
                                        stride=2,
                                        block_id=1)
            inputs = inverted_res_block(inputs,
                                        filters=24,
                                        stride=1,
                                        block_id=2)

            inputs = inverted_res_block(inputs,
                                        filters=32,
                                        stride=2,
                                        block_id=3)
            inputs = inverted_res_block(inputs,
                                        filters=32,
                                        stride=1,
                                        block_id=4)
            inputs = inverted_res_block(inputs,
                                        filters=32,
                                        stride=1,
                                        block_id=5)

            inputs = inverted_res_block(inputs,
                                        filters=64,
                                        stride=2,
                                        block_id=6)
            inputs = inverted_res_block(inputs,
                                        filters=64,
                                        stride=1,
                                        block_id=7)
            inputs = inverted_res_block(inputs,
                                        filters=64,
                                        stride=1,
                                        block_id=8)
            inputs = inverted_res_block(inputs,
                                        filters=64,
                                        stride=1,
                                        block_id=9)

            inputs = inverted_res_block(inputs,
                                        filters=96,
                                        stride=1,
                                        block_id=10)
            inputs = inverted_res_block(inputs,
                                        filters=96,
                                        stride=1,
                                        block_id=11)
            inputs = inverted_res_block(inputs,
                                        filters=96,
                                        stride=1,
                                        block_id=12)

            inputs = inverted_res_block(inputs,
                                        filters=160,
                                        stride=2,
                                        block_id=13)
            inputs = inverted_res_block(inputs,
                                        filters=160,
                                        stride=1,
                                        block_id=14)
            inputs = inverted_res_block(inputs,
                                        filters=160,
                                        stride=1,
                                        block_id=15)

            inputs = inverted_res_block(inputs,
                                        filters=320,
                                        stride=1,
                                        block_id=16)

            last_block_filters = max(1280, _make_divisible(1280 * width, 8))

            inputs = conv2d_fixed_padding(inputs=inputs,
                                          filters=last_block_filters,
                                          kernel_size=1,
                                          strides=1,
                                          pruning_method=pruning_method,
                                          data_format=data_format,
                                          weight_decay=weight_decay,
                                          name='final_1x1_conv')

            inputs = resnet_model.batch_norm_relu(inputs,
                                                  is_training,
                                                  data_format=data_format)

            if data_format == 'channels_last':
                pool_size = (inputs.shape[1], inputs.shape[2])
            elif data_format == 'channels_first':
                pool_size = (inputs.shape[2], inputs.shape[3])

            inputs = tf.layers.average_pooling2d(inputs=inputs,
                                                 pool_size=pool_size,
                                                 strides=1,
                                                 padding='VALID',
                                                 data_format=data_format,
                                                 name='final_avg_pool')
            inputs = tf.identity(inputs, 'final_avg_pool')
            inputs = tf.reshape(inputs, [-1, last_block_filters])

            kernel_initializer = tf.variance_scaling_initializer()

            kernel_regularizer = contrib_layers.l2_regularizer(weight_decay)
            if prune_last_layer:
                inputs = sparse_fully_connected(
                    x=inputs,
                    units=num_classes,
                    sparsity_technique=pruning_method
                    if prune_last_layer else 'baseline',
                    kernel_initializer=kernel_initializer,
                    kernel_regularizer=kernel_regularizer,
                    name='final_dense')
            else:
                inputs = tf.layers.dense(inputs=inputs,
                                         units=num_classes,
                                         activation=None,
                                         use_bias=True,
                                         kernel_initializer=kernel_initializer,
                                         kernel_regularizer=kernel_regularizer,
                                         name='final_dense')

            inputs = tf.identity(inputs, 'final_dense')
        return inputs
  def call(self, inputs, training=True, survival_prob=None):
    """Implementation of call().

    Args:
      inputs: the inputs tensor.
      training: boolean, whether the model is constructed for training.
      survival_prob: float, between 0 to 1, drop connect rate.

    Returns:
      A output tensor.
    """
    logging.info('Block input: %s shape: %s', inputs.name, inputs.shape)
    logging.info('Block input depth: %s output depth: %s',
                 self._block_args.input_filters,
                 self._block_args.output_filters)

    x = inputs

    fused_conv_fn = self._fused_conv
    expand_conv_fn = self._expand_conv
    depthwise_conv_fn = self._depthwise_conv
    project_conv_fn = self._project_conv

    if self._block_args.condconv:
      pooled_inputs = self._avg_pooling(inputs)
      routing_weights = self._routing_fn(pooled_inputs)
      # Capture routing weights as additional input to CondConv layers
      fused_conv_fn = functools.partial(
          self._fused_conv, routing_weights=routing_weights)
      expand_conv_fn = functools.partial(
          self._expand_conv, routing_weights=routing_weights)
      depthwise_conv_fn = functools.partial(
          self._depthwise_conv, routing_weights=routing_weights)
      project_conv_fn = functools.partial(
          self._project_conv, routing_weights=routing_weights)

    # creates conv 2x2 kernel
    if self._block_args.space2depth == 1:
      with tf.variable_scope('space2depth'):
        x = self._relu_fn(
            self._bnsp(self._space2depth(x), training=training))
      logging.info(
          'Block start with space2depth: %s shape: %s', x.name, x.shape)

    if self._block_args.fused_conv:
      # If use fused mbconv, skip expansion and use regular conv.
      x = self._relu_fn(self._bn1(fused_conv_fn(x), training=training))
      logging.info('Conv2D: %s shape: %s', x.name, x.shape)
    else:
      # Otherwise, first apply expansion and then apply depthwise conv.
      if self._block_args.expand_ratio != 1:
        x = self._relu_fn(self._bn0(expand_conv_fn(x), training=training))
        logging.info('Expand: %s shape: %s', x.name, x.shape)

      x = self._relu_fn(self._bn1(depthwise_conv_fn(x), training=training))
      logging.info('DWConv: %s shape: %s', x.name, x.shape)

    if self._has_se:
      with tf.variable_scope('se'):
        x = self._call_se(x)

    self.endpoints = {'expansion_output': x}

    x = self._bn2(project_conv_fn(x), training=training)
    # Add identity so that quantization-aware training can insert quantization
    # ops correctly.
    x = tf.identity(x)
    if self._clip_projection_output:
      x = tf.clip_by_value(x, -6, 6)
    if self._block_args.id_skip:
      if all(
          s == 1 for s in self._block_args.strides
      ) and inputs.get_shape().as_list()[-1] == x.get_shape().as_list()[-1]:
        # Apply only if skip connection presents.
        if survival_prob:
          x = utils.drop_connect(x, training, survival_prob)
        x = tf.add(x, inputs)
    logging.info('Project: %s shape: %s', x.name, x.shape)
    return x
Example #10
0
def inception_v3_base(inputs,
                      final_endpoint='Mixed_7c',
                      min_depth=16,
                      depth_multiplier=1.0,
                      scope=None):
    """Inception model from http://arxiv.org/abs/1512.00567.
  Constructs an Inception v3 network from inputs to the given final endpoint.
  This method can construct the network up to the final inception block
  Mixed_7c.
  Note that the names of the layers in the paper do not correspond to the names
  of the endpoints registered by this function although they build the same
  network.
  Here is a mapping from the old_names to the new names:
  Old name          | New name
  =======================================
  conv0             | Conv2d_1a_3x3
  conv1             | Conv2d_2a_3x3
  conv2             | Conv2d_2b_3x3
  pool1             | MaxPool_3a_3x3
  conv3             | Conv2d_3b_1x1
  conv4             | Conv2d_4a_3x3
  pool2             | MaxPool_5a_3x3
  mixed_35x35x256a  | Mixed_5b
  mixed_35x35x288a  | Mixed_5c
  mixed_35x35x288b  | Mixed_5d
  mixed_17x17x768a  | Mixed_6a
  mixed_17x17x768b  | Mixed_6b
  mixed_17x17x768c  | Mixed_6c
  mixed_17x17x768d  | Mixed_6d
  mixed_17x17x768e  | Mixed_6e
  mixed_8x8x1280a   | Mixed_7a
  mixed_8x8x2048a   | Mixed_7b
  mixed_8x8x2048b   | Mixed_7c
  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    final_endpoint: specifies the endpoint to construct the network up to. It
      can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
      'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3',
      'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c',
      'Mixed_6d', 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c'].
    min_depth: Minimum depth value (number of channels) for all convolution ops.
      Enforced when depth_multiplier < 1, and not an active constraint when
      depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    scope: Optional variable_scope.
  Returns:
    tensor_out: output tensor corresponding to the final_endpoint.
    end_points: a set of activations for external use, for example summaries or
                losses.
  Raises:
    ValueError: if final_endpoint is not set to one of the predefined values,
                or depth_multiplier <= 0
  """
    # end_points will collect relevant activations for external use, for example
    # summaries or losses.
    end_points = {}

    if depth_multiplier <= 0:
        raise ValueError('depth_multiplier is not greater than zero.')
    depth = lambda d: max(int(d * depth_multiplier), min_depth)

    with tf.variable_scope(scope, 'InceptionV3', [inputs]):
        with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                            stride=1,
                            padding='VALID'):
            # 299 x 299 x 3
            end_point = 'Conv2d_1a_3x3'
            net = slim.conv2d(inputs,
                              depth(32), [3, 3],
                              stride=2,
                              scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points
            # 149 x 149 x 32
            end_point = 'Conv2d_2a_3x3'
            net = slim.conv2d(net, depth(32), [3, 3], scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points
            # 147 x 147 x 32
            end_point = 'Conv2d_2b_3x3'
            net = slim.conv2d(net,
                              depth(64), [3, 3],
                              padding='SAME',
                              scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points
            # 147 x 147 x 64
            end_point = 'MaxPool_3a_3x3'
            net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points
            # 73 x 73 x 64
            end_point = 'Conv2d_3b_1x1'
            net = slim.conv2d(net, depth(80), [1, 1], scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points
            # 73 x 73 x 80.
            end_point = 'Conv2d_4a_3x3'
            net = slim.conv2d(net, depth(192), [3, 3], scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points
            # 71 x 71 x 192.
            end_point = 'MaxPool_5a_3x3'
            net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points
            # 35 x 35 x 192.

        # Inception blocks
        with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                            stride=1,
                            padding='SAME'):
            # mixed: 35 x 35 x 256.
            end_point = 'Mixed_5b'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(48), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(64), [5, 5],
                                           scope='Conv2d_0b_5x5')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(96), [3, 3],
                                           scope='Conv2d_0b_3x3')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(96), [3, 3],
                                           scope='Conv2d_0c_3x3')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(32), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points

            # mixed_1: 35 x 35 x 288.
            end_point = 'Mixed_5c'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(48), [1, 1],
                                           scope='Conv2d_0b_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(64), [5, 5],
                                           scope='Conv_1_0c_5x5')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(96), [3, 3],
                                           scope='Conv2d_0b_3x3')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(96), [3, 3],
                                           scope='Conv2d_0c_3x3')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points

            # mixed_2: 35 x 35 x 288.
            end_point = 'Mixed_5d'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(48), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(64), [5, 5],
                                           scope='Conv2d_0b_5x5')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(96), [3, 3],
                                           scope='Conv2d_0b_3x3')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(96), [3, 3],
                                           scope='Conv2d_0c_3x3')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points

            # mixed_3: 17 x 17 x 768.
            end_point = 'Mixed_6a'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(384), [3, 3],
                                           stride=2,
                                           padding='VALID',
                                           scope='Conv2d_1a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(64), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(96), [3, 3],
                                           scope='Conv2d_0b_3x3')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(96), [3, 3],
                                           stride=2,
                                           padding='VALID',
                                           scope='Conv2d_1a_1x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.max_pool2d(net, [3, 3],
                                               stride=2,
                                               padding='VALID',
                                               scope='MaxPool_1a_3x3')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2])
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points

            # mixed4: 17 x 17 x 768.
            end_point = 'Mixed_6b'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(128), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(128), [1, 7],
                                           scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [7, 1],
                                           scope='Conv2d_0c_7x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(128), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(128), [7, 1],
                                           scope='Conv2d_0b_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(128), [1, 7],
                                           scope='Conv2d_0c_1x7')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(128), [7, 1],
                                           scope='Conv2d_0d_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [1, 7],
                                           scope='Conv2d_0e_1x7')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points

            # mixed_5: 17 x 17 x 768.
            end_point = 'Mixed_6c'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(160), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(160), [1, 7],
                                           scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [7, 1],
                                           scope='Conv2d_0c_7x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(160), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(160), [7, 1],
                                           scope='Conv2d_0b_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(160), [1, 7],
                                           scope='Conv2d_0c_1x7')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(160), [7, 1],
                                           scope='Conv2d_0d_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [1, 7],
                                           scope='Conv2d_0e_1x7')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points
            # mixed_6: 17 x 17 x 768.
            end_point = 'Mixed_6d'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(160), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(160), [1, 7],
                                           scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [7, 1],
                                           scope='Conv2d_0c_7x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(160), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(160), [7, 1],
                                           scope='Conv2d_0b_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(160), [1, 7],
                                           scope='Conv2d_0c_1x7')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(160), [7, 1],
                                           scope='Conv2d_0d_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [1, 7],
                                           scope='Conv2d_0e_1x7')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points

            # mixed_7: 17 x 17 x 768.
            end_point = 'Mixed_6e'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [1, 7],
                                           scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [7, 1],
                                           scope='Conv2d_0c_7x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [7, 1],
                                           scope='Conv2d_0b_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [1, 7],
                                           scope='Conv2d_0c_1x7')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [7, 1],
                                           scope='Conv2d_0d_7x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(192), [1, 7],
                                           scope='Conv2d_0e_1x7')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points

            # mixed_8: 8 x 8 x 1280.
            end_point = 'Mixed_7a'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_0 = slim.conv2d(branch_0,
                                           depth(320), [3, 3],
                                           stride=2,
                                           padding='VALID',
                                           scope='Conv2d_1a_3x3')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [1, 7],
                                           scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [7, 1],
                                           scope='Conv2d_0c_7x1')
                    branch_1 = slim.conv2d(branch_1,
                                           depth(192), [3, 3],
                                           stride=2,
                                           padding='VALID',
                                           scope='Conv2d_1a_3x3')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.max_pool2d(net, [3, 3],
                                               stride=2,
                                               padding='VALID',
                                               scope='MaxPool_1a_3x3')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2])
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points
            # mixed_9: 8 x 8 x 2048.
            end_point = 'Mixed_7b'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(320), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(384), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = tf.concat(axis=3,
                                         values=[
                                             slim.conv2d(
                                                 branch_1,
                                                 depth(384), [1, 3],
                                                 scope='Conv2d_0b_1x3'),
                                             slim.conv2d(branch_1,
                                                         depth(384), [3, 1],
                                                         scope='Conv2d_0b_3x1')
                                         ])
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(448), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(384), [3, 3],
                                           scope='Conv2d_0b_3x3')
                    branch_2 = tf.concat(axis=3,
                                         values=[
                                             slim.conv2d(
                                                 branch_2,
                                                 depth(384), [1, 3],
                                                 scope='Conv2d_0c_1x3'),
                                             slim.conv2d(branch_2,
                                                         depth(384), [3, 1],
                                                         scope='Conv2d_0d_3x1')
                                         ])
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points

            # mixed_10: 8 x 8 x 2048.
            end_point = 'Mixed_7c'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net,
                                           depth(320), [1, 1],
                                           scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net,
                                           depth(384), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_1 = tf.concat(axis=3,
                                         values=[
                                             slim.conv2d(
                                                 branch_1,
                                                 depth(384), [1, 3],
                                                 scope='Conv2d_0b_1x3'),
                                             slim.conv2d(branch_1,
                                                         depth(384), [3, 1],
                                                         scope='Conv2d_0c_3x1')
                                         ])
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net,
                                           depth(448), [1, 1],
                                           scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2,
                                           depth(384), [3, 3],
                                           scope='Conv2d_0b_3x3')
                    branch_2 = tf.concat(axis=3,
                                         values=[
                                             slim.conv2d(
                                                 branch_2,
                                                 depth(384), [1, 3],
                                                 scope='Conv2d_0c_1x3'),
                                             slim.conv2d(branch_2,
                                                         depth(384), [3, 1],
                                                         scope='Conv2d_0d_3x1')
                                         ])
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3],
                                               scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3,
                                           depth(192), [1, 1],
                                           scope='Conv2d_0b_1x1')
                net = tf.concat(
                    axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint: return net, end_points
        raise ValueError('Unknown final endpoint %s' % final_endpoint)
Example #11
0
def inception_v3(inputs,
                 num_classes=1000,
                 is_training=True,
                 dropout_keep_prob=0.8,
                 min_depth=16,
                 depth_multiplier=1.0,
                 prediction_fn=slim.softmax,
                 spatial_squeeze=True,
                 reuse=None,
                 create_aux_logits=True,
                 scope='InceptionV3',
                 global_pool=False):
    """Inception model from http://arxiv.org/abs/1512.00567.
  "Rethinking the Inception Architecture for Computer Vision"
  Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens,
  Zbigniew Wojna.
  With the default arguments this method constructs the exact model defined in
  the paper. However, one can experiment with variations of the inception_v3
  network by changing arguments dropout_keep_prob, min_depth and
  depth_multiplier.
  The default image size used to train this network is 299x299.
  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes. If 0 or None, the logits layer
      is omitted and the input features to the logits layer (before dropout)
      are returned instead.
    is_training: whether is training or not.
    dropout_keep_prob: the percentage of activation values that are retained.
    min_depth: Minimum depth value (number of channels) for all convolution ops.
      Enforced when depth_multiplier < 1, and not an active constraint when
      depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    prediction_fn: a function to get predictions out of logits.
    spatial_squeeze: if True, logits is of shape [B, C], if false logits is of
        shape [B, 1, 1, C], where B is batch_size and C is number of classes.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    create_aux_logits: Whether to create the auxiliary logits.
    scope: Optional variable_scope.
    global_pool: Optional boolean flag to control the avgpooling before the
      logits layer. If false or unset, pooling is done with a fixed window
      that reduces default-sized inputs to 1x1, while larger inputs lead to
      larger outputs. If true, any input size is pooled down to 1x1.
  Returns:
    net: a Tensor with the logits (pre-softmax activations) if num_classes
      is a non-zero integer, or the non-dropped-out input to the logits layer
      if num_classes is 0 or None.
    end_points: a dictionary from components of the network to the corresponding
      activation.
  Raises:
    ValueError: if 'depth_multiplier' is less than or equal to zero.
  """
    if depth_multiplier <= 0:
        raise ValueError('depth_multiplier is not greater than zero.')
    depth = lambda d: max(int(d * depth_multiplier), min_depth)

    with tf.variable_scope(scope, 'InceptionV3', [inputs],
                           reuse=reuse) as scope:
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            net, end_points = inception_v3_base(
                inputs,
                scope=scope,
                min_depth=min_depth,
                depth_multiplier=depth_multiplier)

            # Auxiliary Head logits
            if create_aux_logits and num_classes:
                with slim.arg_scope(
                    [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                        stride=1,
                        padding='SAME'):
                    aux_logits = end_points['Mixed_6e']
                    with tf.variable_scope('AuxLogits'):
                        aux_logits = slim.avg_pool2d(aux_logits, [5, 5],
                                                     stride=3,
                                                     padding='VALID',
                                                     scope='AvgPool_1a_5x5')
                        aux_logits = slim.conv2d(aux_logits,
                                                 depth(128), [1, 1],
                                                 scope='Conv2d_1b_1x1')

                        # Shape of feature map before the final layer.
                        kernel_size = _reduced_kernel_size_for_small_input(
                            aux_logits, [5, 5])
                        aux_logits = slim.conv2d(
                            aux_logits,
                            depth(768),
                            kernel_size,
                            weights_initializer=trunc_normal(0.01),
                            padding='VALID',
                            scope='Conv2d_2a_{}x{}'.format(*kernel_size))
                        aux_logits = slim.conv2d(
                            aux_logits,
                            num_classes, [1, 1],
                            activation_fn=None,
                            normalizer_fn=None,
                            weights_initializer=trunc_normal(0.001),
                            scope='Conv2d_2b_1x1')
                        if spatial_squeeze:
                            aux_logits = tf.squeeze(aux_logits, [1, 2],
                                                    name='SpatialSqueeze')
                        end_points['AuxLogits'] = aux_logits

            # Final pooling and prediction
            with tf.variable_scope('Logits'):
                if global_pool:
                    # Global average pooling.
                    net = tf.reduce_mean(input_tensor=net,
                                         axis=[1, 2],
                                         keepdims=True,
                                         name='GlobalPool')
                    end_points['global_pool'] = net
                else:
                    # Pooling with a fixed kernel size.
                    kernel_size = _reduced_kernel_size_for_small_input(
                        net, [8, 8])
                    net = slim.avg_pool2d(
                        net,
                        kernel_size,
                        padding='VALID',
                        scope='AvgPool_1a_{}x{}'.format(*kernel_size))
                    end_points['AvgPool_1a'] = net
                if not num_classes:
                    return net, end_points
                # 1 x 1 x 2048
                net = slim.dropout(net,
                                   keep_prob=dropout_keep_prob,
                                   scope='Dropout_1b')
                end_points['PreLogits'] = net
                # 2048
                logits = slim.conv2d(net,
                                     num_classes, [1, 1],
                                     activation_fn=None,
                                     normalizer_fn=None,
                                     scope='Conv2d_1c_1x1')
                if spatial_squeeze:
                    logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
                # 1000
            end_points['Logits'] = logits
            end_points['Predictions'] = prediction_fn(logits,
                                                      scope='Predictions')
    return logits, end_points
Example #12
0
def transformer_model(
    input_tensor,
    attention_mask=None,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    intermediate_act_fn=gelu,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    initializer_range=0.02,
    do_return_all_layers=False,
):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".

    This is almost an exact implementation of the original Transformer encoder.

    See the original paper:
    https://arxiv.org/abs/1706.03762

    Also see:
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
      attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
        seq_length], with 1 for positions that can be attended to and 0 in
        positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_attention_heads: int. Number of attention heads in the Transformer.
      intermediate_size: int. The size of the "intermediate" (a.k.a., feed
        forward) layer.
      intermediate_act_fn: function. The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: float. Dropout probability for the hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the attention
        probabilities.
      initializer_range: float. Range of the initializer (stddev of truncated
        normal).
      do_return_all_layers: Whether to also return all layers or just the final
        layer.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size], the final
      hidden layer of the Transformer.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads)
        )

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError(
            "The width of the input tensor (%d) != hidden size (%d)"
            % (input_width, hidden_size)
        )

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    prev_output = reshape_to_matrix(input_tensor)

    attn_maps = []
    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
            with tf.variable_scope("attention"):
                attention_heads = []
                with tf.variable_scope("self"):
                    attention_head, probs = attention_layer(
                        from_tensor=prev_output,
                        to_tensor=prev_output,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length,
                    )
                    attention_heads.append(attention_head)
                    attn_maps.append(probs)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    # In the case where we have other sequences, we just concatenate
                    # them to the self-attention head before the projection.
                    attention_output = tf.concat(attention_heads, axis=-1)

                # Run a linear projection of `hidden_size` then add a residual
                # with `layer_input`.
                with tf.variable_scope("output"):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=create_initializer(initializer_range),
                    )
                    attention_output = dropout(attention_output, hidden_dropout_prob)
                    attention_output = layer_norm(attention_output + prev_output)

            # The activation is only applied to the "intermediate" hidden layer.
            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range),
                )

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope("output"):
                prev_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range),
                )
                prev_output = dropout(prev_output, hidden_dropout_prob)
                prev_output = layer_norm(prev_output + attention_output)
                all_layer_outputs.append(prev_output)

    attn_maps = tf.stack(attn_maps, 0)
    if do_return_all_layers:
        return (
            tf.stack(
                [
                    reshape_from_matrix(layer, input_shape)
                    for layer in all_layer_outputs
                ],
                0,
            ),
            attn_maps,
        )
    else:
        return reshape_from_matrix(prev_output, input_shape), attn_maps
Example #13
0
    def __init__(
        self,
        bert_config,
        is_training,
        input_ids,
        input_mask=None,
        token_type_ids=None,
        use_one_hot_embeddings=True,
        scope=None,
        embedding_size=None,
        input_embeddings=None,
        input_reprs=None,
        update_embeddings=True,
        untied_embeddings=False,
    ):
        """Constructor for BertModel.

        Args:
          bert_config: `BertConfig` instance.
          is_training: bool. true for training model, false for eval model. Controls
            whether dropout will be applied.
          input_ids: int32 Tensor of shape [batch_size, seq_length].
          input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
          token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
          use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
            embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
            it is much faster if this is True, on the CPU or GPU, it is faster if
            this is False.
          scope: (optional) variable scope. Defaults to "electra".

        Raises:
          ValueError: The config is invalid or one of the input tensor shapes
            is invalid.
        """
        bert_config = copy.deepcopy(bert_config)
        if not is_training:
            bert_config.hidden_dropout_prob = 0.0
            bert_config.attention_probs_dropout_prob = 0.0

        input_shape = get_shape_list(token_type_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

        assert token_type_ids is not None

        if input_reprs is None:
            if input_embeddings is None:
                with tf.variable_scope(
                    (scope if untied_embeddings else "electra") + "/embeddings",
                    reuse=tf.AUTO_REUSE,
                ):
                    # Perform embedding lookup on the word ids
                    if embedding_size is None:
                        embedding_size = bert_config.hidden_size
                    (self.token_embeddings, self.embedding_table) = embedding_lookup(
                        input_ids=input_ids,
                        vocab_size=bert_config.vocab_size,
                        embedding_size=embedding_size,
                        initializer_range=bert_config.initializer_range,
                        word_embedding_name="word_embeddings",
                        use_one_hot_embeddings=use_one_hot_embeddings,
                    )
            else:
                self.token_embeddings = input_embeddings

            with tf.variable_scope(
                (scope if untied_embeddings else "electra") + "/embeddings",
                reuse=tf.AUTO_REUSE,
            ):
                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.token_embeddings,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=bert_config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=bert_config.initializer_range,
                    max_position_embeddings=bert_config.max_position_embeddings,
                    dropout_prob=bert_config.hidden_dropout_prob,
                )
        else:
            self.embedding_output = input_reprs
        if not update_embeddings:
            self.embedding_output = tf.stop_gradient(self.embedding_output)

        with tf.variable_scope(scope, default_name="electra"):
            if self.embedding_output.shape[-1] != bert_config.hidden_size:
                self.embedding_output = tf.layers.dense(
                    self.embedding_output,
                    bert_config.hidden_size,
                    name="embeddings_project",
                )

            with tf.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = create_attention_mask_from_input_mask(
                    token_type_ids, input_mask
                )

                # Run the stacked transformer. Output shapes
                # sequence_output: [batch_size, seq_length, hidden_size]
                # pooled_output: [batch_size, hidden_size]
                # all_encoder_layers: [n_layers, batch_size, seq_length, hidden_size].
                # attn_maps: [n_layers, batch_size, n_heads, seq_length, seq_length]
                (self.all_layer_outputs, self.attn_maps) = transformer_model(
                    input_tensor=self.embedding_output,
                    attention_mask=attention_mask,
                    hidden_size=bert_config.hidden_size,
                    num_hidden_layers=bert_config.num_hidden_layers,
                    num_attention_heads=bert_config.num_attention_heads,
                    intermediate_size=bert_config.intermediate_size,
                    intermediate_act_fn=get_activation(bert_config.hidden_act),
                    hidden_dropout_prob=bert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
                    initializer_range=bert_config.initializer_range,
                    do_return_all_layers=True,
                )
                self.sequence_output = self.all_layer_outputs[-1]
                self.pooled_output = self.sequence_output[:, 0]
Example #14
0
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features["input_ids"]
        label_ids = features["label_ids"]
        if "is_real_example" in features:
            is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
        else:
            is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # Create model with aux loss
        model = GroverModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            pad_token_id=config.pad_token_id,
            chop_off_last_token=False,
        )

        with tf.variable_scope('classification'):
            hidden_state = model.pooled_output(pool_token_id)
            if is_training:
                hidden_state = dropout(hidden_state, dropout_prob=0.1)
            logits = tf.layers.dense(
                hidden_state,
                num_labels,
                kernel_initializer=create_initializer(config.initializer_range),
                name='logits'
            )
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(label_ids, depth=num_labels, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
            class_loss = tf.reduce_mean(per_example_loss)

        total_loss = lm_loss_coef * model.lm_loss() + class_loss

        if is_training:
            train_op, train_metrics = optimization_adafactor.create_optimizer(
                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
            # tvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            tvars = tf.trainable_variables()

            train_metrics['minibatch_cls_loss'] = class_loss
            train_metrics['minibatch_acc'] = tf.reduce_mean(
                tf.cast(tf.equal(tf.argmax(logits, axis=-1, output_type=tf.int32),
                                 label_ids), tf.float32))
        else:
            train_op = None
            train_metrics = {}
            tvars = tf.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            if use_tpu:
                def tpu_scaffold():
                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            if use_tpu:
                output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    train_op=train_op,
                    host_call=construct_scalar_host_call(metric_dict=train_metrics, model_dir=params['model_dir'],
                                                         prefix='training/'),
                    scaffold_fn=scaffold_fn)
            else:
                output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    train_op=train_op,
                    training_hooks=[
                        tf.train.LoggingTensorHook({'loss': tf.metrics.mean(total_loss)[1]}, every_n_iter=100)],
                    scaffold_fn=scaffold_fn)

        elif mode == tf.estimator.ModeKeys.EVAL:
            def metric_fn(per_example_loss, label_ids, logits, is_real_example):
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                accuracy = tf.metrics.accuracy(
                    labels=label_ids, predictions=predictions, weights=is_real_example)
                loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
                return {
                    "eval_accuracy": accuracy,
                    "eval_loss": loss,
                }

            eval_metrics = (metric_fn,
                            [per_example_loss, label_ids, logits, is_real_example])
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)
        else:
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                predictions={'logits': logits,
                             'probs': tf.nn.softmax(logits, axis=-1)},
                scaffold_fn=scaffold_fn)
        return output_spec
Example #15
0
File: ADER.py Project: rcdnn/ADER
    def __init__(self, item_num, args, reuse=None):
        self.args = args
        self.is_training = tf.placeholder(tf.bool, shape=())
        self.input_seq = tf.placeholder(tf.int32, shape=(None, args.maxlen))
        self.pos = tf.placeholder(tf.int32, shape=None)
        self.exemplar_logits = tf.placeholder(tf.float32, shape=(None, None))
        self.exemplar_pos = tf.placeholder(tf.int32, shape=None)
        self.max_item = tf.placeholder(tf.int32, shape=())
        self.lr = tf.placeholder(tf.float32, shape=())
        self.dropout_rate = tf.placeholder(tf.float32, shape=())
        pos = self.pos
        mask = tf.expand_dims(tf.to_float(tf.not_equal(self.input_seq, 0)), -1)

        with tf.variable_scope("SASRec", reuse=reuse):
            # sequence embedding, item embedding table
            self.seq, item_emb_table = embedding(self.input_seq,
                                                 vocab_size=item_num + 1,
                                                 num_units=args.hidden_units,
                                                 zero_pad=True,
                                                 scale=True,
                                                 l2_reg=args.l2_emb,
                                                 scope="input_embeddings",
                                                 with_t=True,
                                                 reuse=reuse
                                                 )

            # # Positional Encoding
            t, pos_emb_table = embedding(
                tf.tile(tf.expand_dims(tf.range(tf.shape(self.input_seq)[1]), 0), [tf.shape(self.input_seq)[0], 1]),
                vocab_size=args.maxlen,
                num_units=args.hidden_units,
                zero_pad=False,
                scale=False,
                l2_reg=args.l2_emb,
                scope="dec_pos",
                reuse=reuse,
                with_t=True
            )
            self.seq += t

            # Dropout
            self.seq = tf.layers.dropout(self.seq,
                                         rate=self.dropout_rate,
                                         training=tf.convert_to_tensor(self.is_training),
                                         seed=args.random_seed)

            self.seq *= mask

            # Build blocks
            for i in range(args.num_blocks):
                with tf.variable_scope("num_blocks_%d" % i):
                    # Self-attention
                    self.seq = multihead_attention(queries=normalize(self.seq),
                                                   keys=self.seq,
                                                   num_units=args.hidden_units,
                                                   num_heads=args.num_heads,
                                                   dropout_rate=self.dropout_rate,
                                                   seed=args.random_seed,
                                                   is_training=self.is_training,
                                                   causality=True,
                                                   scope="self_attention")

                    # Feed forward
                    self.seq = feedforward(normalize(self.seq), num_units=[args.hidden_units, args.hidden_units],
                                           dropout_rate=self.dropout_rate, is_training=self.is_training,
                                           seed=args.random_seed)
                    self.seq *= mask

            self.seq = normalize(self.seq)

        # find representation
        self.rep = self.seq[:, -1, :]

        # define loss
        seq_emb = tf.reshape(self.rep, [tf.shape(self.input_seq)[0], args.hidden_units])
        indices = pos - 1
        self.labels = tf.one_hot(indices, self.max_item)
        item_emb = tf.nn.embedding_lookup(item_emb_table, tf.range(1, self.max_item + 1))
        self.logits = tf.matmul(seq_emb, tf.transpose(item_emb))
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=self.logits))

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)

        # prediction
        self.test_item = tf.placeholder(tf.int32, shape=None)
        self.test_item_emb = tf.nn.embedding_lookup(item_emb_table, self.test_item)
        self.test_logits = tf.matmul(seq_emb, tf.transpose(self.test_item_emb))
        self.test_logits = tf.reshape(self.test_logits, [tf.shape(self.input_seq)[0], tf.shape(self.test_item)[0]])
        self.pred_last = tf.argsort(tf.argsort(-self.test_logits))
  def call(self,
           inputs,
           training=True,
           features_only=None,
           pooled_features_only=False):
    """Implementation of call().

    Args:
      inputs: input tensors.
      training: boolean, whether the model is constructed for training.
      features_only: build the base feature network only.
      pooled_features_only: build the base network for features extraction
        (after 1x1 conv layer and global pooling, but before dropout and fc
        head).

    Returns:
      output tensors.
    """
    outputs = None
    self.endpoints = {}
    reduction_idx = 0
    # Calls Stem layers
    with tf.variable_scope('stem'):
      outputs = self._relu_fn(
          self._bn0(self._conv_stem(inputs), training=training))
    logging.info('Built stem layers with output shape: %s', outputs.shape)
    self.endpoints['stem'] = outputs

    # Calls blocks.
    for idx, block in enumerate(self._blocks):
      is_reduction = False  # reduction flag for blocks after the stem layer
      # If the first block has space-to-depth layer, then stem is
      # the first reduction point.
      if (block.block_args().space2depth == 1 and idx == 0):
        reduction_idx += 1
        self.endpoints['reduction_%s' % reduction_idx] = outputs

      elif ((idx == len(self._blocks) - 1) or
            self._blocks[idx + 1].block_args().strides[0] > 1):
        is_reduction = True
        reduction_idx += 1

      with tf.variable_scope('blocks_%s' % idx):
        survival_prob = self._global_params.survival_prob
        if survival_prob:
          drop_rate = 1.0 - survival_prob
          survival_prob = 1.0 - drop_rate * float(idx) / len(self._blocks)
          logging.info('block_%s survival_prob: %s', idx, survival_prob)
        outputs = block.call(
            outputs, training=training, survival_prob=survival_prob)
        self.endpoints['block_%s' % idx] = outputs
        if is_reduction:
          self.endpoints['reduction_%s' % reduction_idx] = outputs
        if block.endpoints:
          for k, v in six.iteritems(block.endpoints):
            self.endpoints['block_%s/%s' % (idx, k)] = v
            if is_reduction:
              self.endpoints['reduction_%s/%s' % (reduction_idx, k)] = v
    self.endpoints['features'] = outputs

    if not features_only:
      # Calls final layers and returns logits.
      with tf.variable_scope('head'):
        outputs = self._relu_fn(
            self._bn1(self._conv_head(outputs), training=training))
        self.endpoints['head_1x1'] = outputs

        if self._global_params.local_pooling:
          shape = outputs.get_shape().as_list()
          kernel_size = [
              1, shape[self._spatial_dims[0]], shape[self._spatial_dims[1]], 1]
          outputs = tf.nn.avg_pool(
              outputs, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID')
          self.endpoints['pooled_features'] = outputs
          if not pooled_features_only:
            if self._dropout:
              outputs = self._dropout(outputs, training=training)
            self.endpoints['global_pool'] = outputs
            if self._fc:
              outputs = tf.squeeze(outputs, self._spatial_dims)
              outputs = self._fc(outputs)
            self.endpoints['head'] = outputs
        else:
          outputs = self._avg_pooling(outputs)
          self.endpoints['pooled_features'] = outputs
          if not pooled_features_only:
            if self._dropout:
              outputs = self._dropout(outputs, training=training)
            self.endpoints['global_pool'] = outputs
            if self._fc:
              outputs = self._fc(outputs)
            self.endpoints['head'] = outputs
    return outputs
Example #17
0
    def __init__(
            self,
            config,
            is_training,
            input_ids,
            attention_mask=None,
            token_type_ids=None,
            return_pool=True,
            scope=None,
            reuse=False,
            compute_type=tf.float32
    ):
        super().__init__(config, is_training)

        input_shape = model_utils.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if attention_mask is None:
            attention_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int64)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int64)

        with tf.variable_scope(
                scope, default_name="bert",
                reuse=tf.AUTO_REUSE if reuse else None,
                custom_getter=model_utils.get_custom_getter(compute_type)):
            with tf.variable_scope("embeddings"):
                self.embedding_output, self.embedding_table = bert_embedding(
                    config=self.config,
                    input_ids=input_ids,
                    token_type_ids=token_type_ids,
                    add_position_embedding=True
                )

            with tf.variable_scope("encoder"):
                attention_mask = model_utils.create_bert_mask(
                    input_ids, attention_mask)
                if model_utils.get_shape_list(self.embedding_output)[-1] != self.config.hidden_size:
                    self.embedding_output = layers.dense(
                        self.embedding_output, self.config.hidden_size,
                        'embedding_hidden_mapping_in', initializer_range=self.config.initializer_range
                    )
                encoder_outputs = bert_encoder(
                    input_tensor=tf.saturate_cast(self.embedding_output, compute_type),
                    attention_mask=attention_mask,
                    config=self.config,
                    use_relative_position=False
                )
            if return_pool:
                with tf.variable_scope("pooler"):
                    pooled_output = layers.pooler_layer(
                        sequence_output=encoder_outputs[0],
                        hidden_size=self.config.hidden_size,
                        initializer_range=self.config.initializer_range
                    )
            else:
                pooled_output = None
        # (pooled output, sequence output, all layer outputs, all layer att probs)
        self.outputs = (pooled_output,) + encoder_outputs
Example #18
0
    def __init__(self, *, policy, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None, np_mask=None):
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess, np_mask=np_mask, is_act_model=True)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess, np_mask=np_mask, is_act_model=False)
            else:
                train_model = policy(microbatch_size, nsteps, sess, np_mask=np_mask, is_act_model=False)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]


        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        initialize()
Example #19
0
    ## The construction phase

    zeros = tf.reshape(tf.convert_to_tensor(np.zeros(x.shape)), shape=(-1, 1))
    x = tf.reshape(tf.convert_to_tensor(x), shape=(-1, 1))
    t = tf.reshape(tf.convert_to_tensor(t), shape=(-1, 1))

    points = tf.concat([x, t], 1)

    num_iter = 10000
    num_hidden_neurons = [20, 20, 20]

    X = tf.convert_to_tensor(X)
    T = tf.convert_to_tensor(T)

    with tf.variable_scope('dnn'):
        num_hidden_layers = np.size(num_hidden_neurons)

        previous_layer = points

        for l in range(num_hidden_layers):
            current_layer = tf.layers.dense(previous_layer,
                                            num_hidden_neurons[l],
                                            activation=tf.nn.sigmoid)
            previous_layer = current_layer

        dnn_output = tf.layers.dense(previous_layer, 1)

    def initial_conditions(x):
        return tf.sin(np.pi * x)
Example #20
0
def multihead_attention(queries,
                        keys,
                        times=None,
                        num_units=None,
                        num_heads=1,
                        dropout_rate=0,
                        is_training=True,
                        use_prior="none",
                        causality=True,
                        scope="multihead_attention",
                        residual=False,
                        time_exp_base=None,
                        overlapping_chunks=None,
                        reuse=None,
                        with_qk=False):
    """Applies multihead attention.

  Args:
    queries: A 3d tensor with shape of [N, T_q, C_q].
    keys: A 3d tensor with shape of [N, T_k, C_k].
    times: A 3d tensor with shape of [N, T_q, T_k].
    num_units: A scalar. Attention size.
    num_heads: An int. Number of heads.
    dropout_rate: A floating point number.
    is_training: Boolean. Controller of mechanism for dropout.
    use_prior: String. Whether to use prior for attention heads. Supported
      values include: none, position.
    causality: Boolean. If true, units that reference the future are masked.
    scope: Optional scope for `variable_scope`.
    residual: Boolean. Whether to use residual connection.
    time_exp_base: A scalar. Base for exponential time intervals. Only used for
      the case where use_prior='time'.
    overlapping_chunks: Boolean. Whether to use (non)/overlapping chunks for the
      case where use_prior='time'.
    reuse: Boolean, whether to reuse the weights of a previous layer by the
      same name.  Returns A 3d tensor with shape of (N, T_q, C)
    with_qk: Whether to use qk.
  Returns:
    Output of multihead attention.
  """
    tf.logging.info(
        "Computing attention with prior: {} and num of heads: {}".format(
            use_prior, num_heads))
    with tf.variable_scope(scope, reuse=reuse):
        # Set the fall back option for num_units
        if num_units is None:
            num_units = queries.get_shape().as_list[-1]

        # pylint: disable=invalid-name
        # Linear projections
        # Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)
        # K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)
        # V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)
        Q = tf.layers.dense(queries, num_units, activation=None)  # (N, T_q, C)
        K = tf.layers.dense(keys, num_units, activation=None)  # (N, T_k, C)
        V = tf.layers.dense(keys, num_units, activation=None)  # (N, T_k, C)

        # Split and concat
        Q_ = tf.concat(tf.split(Q, num_heads, axis=2),
                       axis=0)  # (h*N, T_q, C/h)
        K_ = tf.concat(tf.split(K, num_heads, axis=2),
                       axis=0)  # (h*N, T_k, C/h)
        V_ = tf.concat(tf.split(V, num_heads, axis=2),
                       axis=0)  # (h*N, T_k, C/h)
        # pylint: enable=invalid-name

        # Multiplication
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)

        # Scale
        outputs = outputs / (K_.get_shape().as_list()[-1]**0.5)

        # Key Masking
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                            [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

        paddings = tf.ones_like(outputs) * (-2**32 + 1)
        outputs = tf.where(tf.equal(key_masks, 0), paddings,
                           outputs)  # (h*N, T_q, T_k)

        # Causality = Future blinding
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :])  # (T_q, T_k)
            tril = tf.linalg.LinearOperatorLowerTriangular(
                diag_vals).to_dense()  # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0),
                            [tf.shape(outputs)[0], 1, 1])  # (h*N, T_q, T_k)

            paddings = tf.ones_like(masks) * (-2**32 + 1)
            outputs = tf.where(tf.equal(masks, 0), paddings,
                               outputs)  # (h*N, T_q, T_k)

        # Position/Time prior is only used in multi-head case.
        if num_heads > 1:
            # Scaling head weights with position prior.
            if use_prior == "position":
                # Each head focuses on a window of items whose size is computed below.
                attn_size = int(outputs.get_shape().as_list()[-1] / num_heads)
                outputs = tf.concat(_compute_head_weights_with_position_prior(
                    outputs, masks, paddings, num_heads, attn_size),
                                    axis=0)  # (H*N, T_q, T_k)
                tf.logging.info(
                    "After position-wise sliding window attention.")
                tf.logging.info(outputs.shape)

            # Scaling head weights with time prior.
            elif use_prior == "time":
                # Convert time deltas from seconds to days.
                if times is None:
                    raise ValueError("Times tensor is needed.")
                time_deltas = _compute_time_deltas(times) / SECS_TO_DAYS
                outputs = tf.concat(_compute_head_weights_with_time_prior(
                    outputs, paddings, time_deltas, num_heads, time_exp_base,
                    overlapping_chunks),
                                    axis=0)  # (H*N, T_q, T_k)

        # Activation
        outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

        # Query Masking
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries,
                                                   axis=-1)))  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                              [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        outputs *= query_masks  # broadcasting. (h*N, T_q, C)

        # Dropouts
        outputs = tf.layers.dropout(outputs,
                                    rate=dropout_rate,
                                    training=tf.convert_to_tensor(is_training))

        # Weighted sum
        outputs = tf.matmul(outputs, V_)  # (h*N, T_q, C/h)

        # Restore shape
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0),
                            axis=2)  # (N, T_q, C)

        # Residual connection
        if residual:
            outputs += queries

    if with_qk:
        return Q, K
    else:
        return outputs
def build_fc_densenet(inputs,
                      num_classes,
                      preset_model='FC-DenseNet56',
                      n_filters_first_conv=48,
                      n_pool=5,
                      growth_rate=12,
                      n_layers_per_block=4,
                      dropout_p=0.2,
                      scope=None,
                      is_training=True):
    """
    Builds the FC-DenseNet model

    Arguments:
      inputs: the input tensor
      preset_model: The model you want to use
      n_classes: number of classes
      n_filters_first_conv: number of filters for the first convolution applied
      n_pool: number of pooling layers = number of transition down = number of transition up
      growth_rate: number of new feature maps created by each layer in a dense block
      n_layers_per_block: number of layers per block. Can be an int or a list of size 2 * n_pool + 1
      dropout_p: dropout rate applied after each convolution (0. for not using)

    Returns:
      Fc-DenseNet model
    """
    if not is_training:
        #No dropout when predicting
        dropout_p = 0
    if preset_model == 'FC-DenseNet56':
        n_pool = 5
        growth_rate = 12
        n_layers_per_block = 4
    elif preset_model == 'FC-DenseNet67':
        n_pool = 5
        growth_rate = 16
        n_layers_per_block = 5
    elif preset_model == 'FC-DenseNet103':
        n_pool = 5
        growth_rate = 16
        n_layers_per_block = [4, 5, 7, 10, 12, 15, 12, 10, 7, 5, 4]
    else:
        raise ValueError(
            "Unsupported FC-DenseNet model '%s'. This function only supports FC-DenseNet56, FC-DenseNet67, and FC-DenseNet103"
            % (preset_model))

    if type(n_layers_per_block) == list:
        assert (len(n_layers_per_block) == 2 * n_pool + 1)
    elif type(n_layers_per_block) == int:
        n_layers_per_block = [n_layers_per_block] * (2 * n_pool + 1)
    else:
        raise ValueError

    with tf.variable_scope(scope, preset_model, [inputs]) as sc:

        #####################
        # First Convolution #
        #####################
        # We perform a first convolution.
        stack = slim.conv2d(inputs,
                            n_filters_first_conv, [3, 3],
                            scope='first_conv',
                            activation_fn=None)

        n_filters = n_filters_first_conv

        #####################
        # Downsampling path #
        #####################

        skip_connection_list = []

        for i in range(n_pool):
            # Dense Block
            stack, _ = DenseBlock(stack,
                                  n_layers_per_block[i],
                                  growth_rate,
                                  dropout_p,
                                  scope='denseblock%d' % (i + 1))
            n_filters += growth_rate * n_layers_per_block[i]
            # At the end of the dense block, the current stack is stored in the skip_connections list
            skip_connection_list.append(stack)

            # Transition Down
            stack = TransitionDown(stack,
                                   n_filters,
                                   dropout_p,
                                   scope='transitiondown%d' % (i + 1))

        skip_connection_list = skip_connection_list[::-1]

        #####################
        #     Bottleneck    #
        #####################

        # Dense Block
        # We will only upsample the new feature maps
        stack, block_to_upsample = DenseBlock(stack,
                                              n_layers_per_block[n_pool],
                                              growth_rate,
                                              dropout_p,
                                              scope='denseblock%d' %
                                              (n_pool + 1))

        #######################
        #   Upsampling path   #
        #######################

        for i in range(n_pool):
            # Transition Up ( Upsampling + concatenation with the skip connection)
            n_filters_keep = growth_rate * n_layers_per_block[n_pool + i]
            stack = TransitionUp(block_to_upsample,
                                 skip_connection_list[i],
                                 n_filters_keep,
                                 scope='transitionup%d' % (n_pool + i + 1))

            # Dense Block
            # We will only upsample the new feature maps
            stack, block_to_upsample = DenseBlock(
                stack,
                n_layers_per_block[n_pool + i + 1],
                growth_rate,
                dropout_p,
                scope='denseblock%d' % (n_pool + i + 2))

        #####################
        #      Softmax      #
        #####################
        net = slim.conv2d(stack,
                          num_classes, [1, 1],
                          activation_fn=None,
                          scope='logits')
        return net
Example #22
0
def embedding(inputs,
              vocab_size,
              num_units,
              zero_pad=True,
              scale=True,
              l2_reg=0.0,
              scope="embedding",
              with_t=False,
              reuse=None):
    """Embeds a given tensor.

  Args:
    inputs: A `Tensor` with type `int32` or `int64` containing the ids to be
      looked up in `lookup table`.
    vocab_size: An int. Vocabulary size.
    num_units: An int. Number of embedding hidden units.
    zero_pad: A boolean. If True, all the values of the fist row (id 0) should
      be constant zeros.
    scale: A boolean. If True. the outputs is multiplied by sqrt num_units.
    l2_reg: L2 regularization weight.
    scope: Optional scope for `variable_scope`.
    with_t: If True, return the embedding table.
    reuse: Boolean, whether to reuse the weights of a previous layer by the
      same name.

  Returns:
    A `Tensor` with one more rank than inputs's. The last dimensionality
      should be `num_units`.

  For example,

  ```
  import tensorflow as tf

  inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
  outputs = embedding(inputs, 6, 2, zero_pad=True)
  with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      print sess.run(outputs)
  >>
  [[[ 0.          0.        ]
    [ 0.09754146  0.67385566]
    [ 0.37864095 -0.35689294]]

   [[-1.01329422 -1.09939694]
    [ 0.7521342   0.38203377]
    [-0.04973143 -0.06210355]]]
  ```

  ```
  import tensorflow as tf

  inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
  outputs = embedding(inputs, 6, 2, zero_pad=False)
  with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      print sess.run(outputs)
  >>
  [[[-0.19172323 -0.39159766]
    [-0.43212751 -0.66207761]
    [ 1.03452027 -0.26704335]]

   [[-0.11634696 -0.35983452]
    [ 0.50208133  0.53509563]
    [ 1.22204471 -0.96587461]]]
  ```
  """
    with tf.variable_scope(scope, reuse=reuse):
        lookup_table = tf.get_variable(
            "lookup_table",
            dtype=tf.float32,
            shape=[vocab_size, num_units],
            # initializer=tf.contrib.layers.xavier_initializer(),
            regularizer=tf.keras.regularizers.l2(l2_reg))
        if zero_pad:
            lookup_table = tf.concat(
                (tf.zeros(shape=[1, num_units]), lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, inputs)

        if scale:
            outputs = outputs * (num_units**0.5)
    if with_t:
        return outputs, lookup_table
    else:
        return outputs
Example #23
0
def find_trainable_variables(key):
  with tf.variable_scope(key):
    return tf.trainable_variables()
Example #24
0
def build_feature_network(features, config):
    """Build FPN input features.

  Args:
   features: input tensor.
   config: a dict-like config, including all parameters.

  Returns:
    A dict from levels to the feature maps processed after feature network.
  """
    feats = []
    if config.min_level not in features.keys():
        raise ValueError(
            'features.keys ({}) should include min_level ({})'.format(
                features.keys(), config.min_level))

    # Build additional input features that are not from backbone.
    for level in range(config.min_level, config.max_level + 1):
        if level in features.keys():
            feats.append(features[level])
        else:
            # Adds a coarser level by downsampling the last feature map.
            feats.append(
                resample_feature_map(
                    feats[-1],
                    name='p%d' % level,
                    target_width=feats[-1].shape[1] // 2,
                    target_num_channels=config.fpn_num_filters,
                    apply_bn=config.apply_bn_for_resampling,
                    is_training=config.is_training_bn,
                    conv_after_downsample=config.conv_after_downsample,
                    use_native_resize_op=config.use_native_resize_op,
                    pooling_type=config.pooling_type))

    _verify_feats_size(feats,
                       input_size=config.image_size,
                       min_level=config.min_level,
                       max_level=config.max_level)

    with tf.variable_scope('fpn_cells'):
        for rep in range(config.fpn_cell_repeats):
            with tf.variable_scope('cell_{}'.format(rep)):
                logging.info('building cell %d', rep)
                new_feats = build_bifpn_layer(
                    feats=feats,
                    fpn_name=config.fpn_name,
                    fpn_config=config.fpn_config,
                    input_size=config.image_size,
                    fpn_num_filters=config.fpn_num_filters,
                    min_level=config.min_level,
                    max_level=config.max_level,
                    separable_conv=config.separable_conv,
                    is_training=config.is_training_bn,
                    apply_bn_for_resampling=config.apply_bn_for_resampling,
                    conv_after_downsample=config.conv_after_downsample,
                    use_native_resize_op=config.use_native_resize_op,
                    conv_bn_relu_pattern=config.conv_bn_relu_pattern,
                    pooling_type=config.pooling_type)

                feats = [
                    new_feats[level]
                    for level in range(config.min_level, config.max_level + 1)
                ]

                _verify_feats_size(feats,
                                   input_size=config.image_size,
                                   min_level=config.min_level,
                                   max_level=config.max_level)

    return new_feats
Example #25
0
  def construct_model(self,
                      input_tensors=None,
                      prefix='metatrain_',
                      test_num_updates=0):
    """a: training data for inner gradient, b: test data for meta gradient."""

    self.inputa = input_tensors['inputa']
    self.inputb = input_tensors['inputb']
    self.labela = input_tensors['labela']
    self.labelb = input_tensors['labelb']

    with tf.variable_scope('model', reuse=None) as training_scope:
      if 'weights' in dir(self):
        training_scope.reuse_variables()
        weights = self.weights
      else:
        # Define the weights
        self.weights = weights = self.construct_weights()

      # outputbs[i] and lossesb[i] is the output and loss after i+1 gradient
      # updates
      num_updates = max(test_num_updates, FLAGS.num_updates)

      def task_metalearn(inp, reuse=True):
        """Run meta learning."""
        TRAIN = 'train' in prefix  # pylint: disable=invalid-name
        # Perform gradient descent for one task in the meta-batch.
        inputa, inputb, labela, labelb = inp
        task_outputbs, task_lossesb = [], []
        task_msesb = []

        # support_pred and loss, (n_data_per_task, out_dim)
        task_outputa = self.forward(
            inputa, weights, reuse=reuse)  # only not reuse on the first iter
        # labela is (n_data_per_task, out_dim)
        task_lossa = self.loss_func(task_outputa, labela)

        # INNER LOOP (no change with ib)
        grads = tf.gradients(task_lossa, list(weights.values()))
        if FLAGS.stop_grad:
          grads = [tf.stop_gradient(grad) for grad in grads]
        gradients = dict(zip(weights.keys(), grads))
        # theta_pi = theta - alpha * grads
        fast_weights = dict(
            zip(weights.keys(), [
                weights[key] - self.update_lr * gradients[key]
                for key in weights.keys()
            ]))

        # use theta_pi to forward meta-test

        output = self.forward(inputb, weights, reuse=True)
        task_outputbs.append(output)
        # meta-test loss
        task_kl_loss = sum(self.encoder_w.losses)
        task_msesb.append(self.loss_func(output, labelb))
        task_lossesb.append(
            self.loss_func(output, labelb) + self.beta * task_kl_loss)

        def while_body(fast_weights_values):
          """Update params."""
          loss = self.loss_func(
              self.forward(
                  inputa,
                  dict(zip(fast_weights.keys(), fast_weights_values)),
                  reuse=True), labela)
          grads = tf.gradients(loss, fast_weights_values)
          fast_weights_values = [
              v - self.update_lr * g for v, g in zip(fast_weights_values, grads)
          ]
          return fast_weights_values

        fast_weights_values = tf.while_loop(
            lambda _: True,
            while_body,
            loop_vars=[fast_weights.values()],
            maximum_iterations=num_updates - 1,
            back_prop=TRAIN)
        fast_weights = dict(zip(fast_weights.keys(), fast_weights_values))

        output = self.forward(inputb, fast_weights, reuse=True)
        task_outputbs.append(output)
        task_msesb.append(self.loss_func(output, labelb))
        task_lossesb.append(
            self.loss_func(output, labelb) + self.beta * task_kl_loss)
        task_output = [
            task_outputa, task_outputbs, task_lossa, task_lossesb, task_msesb
        ]

        return task_output

      if FLAGS.norm is not None:
        # to initialize the batch norm vars, might want to combine this, and
        # not run idx 0 twice.
        _ = task_metalearn(
            (self.inputa[0], self.inputb[0], self.labela[0], self.labelb[0]),
            False)

      out_dtype = [
          tf.float32, [tf.float32] * 2, tf.float32, [tf.float32] * 2,
          [tf.float32] * 2
      ]
      result = tf.map_fn(task_metalearn, elems=(self.inputa, self.inputb,  \
                                                self.labela, self.labelb), dtype=out_dtype, \
                                                parallel_iterations=FLAGS.meta_batch_size)
      outputas, outputbs, lossesa, lossesb, msesb = result

    ## Performance & Optimization
    if 'train' in prefix:
      # lossesa is length(meta_batch_size)
      self.total_loss1 = tf.reduce_sum(lossesa) / tf.to_float(
          FLAGS.meta_batch_size)
      self.total_losses2 = total_losses2 = [
          tf.reduce_sum(msesb[j]) / tf.to_float(FLAGS.meta_batch_size)
          for j in range(len(msesb))
      ]
      self.total_losses3 = total_losses3 = [
          tf.reduce_sum(lossesb[j]) / tf.to_float(FLAGS.meta_batch_size)
          for j in range(len(lossesb))
      ]
      # after the map_fn
      self.outputas, self.outputbs = outputas, outputbs

      # OUTER LOOP
      if FLAGS.metatrain_iterations > 0:
        optimizer = tf.train.AdamOptimizer(self.meta_lr)
        THETA = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model')  # pylint: disable=invalid-name
        PHI = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder')  # pylint: disable=invalid-name

        self.gvs_theta = gvs_theta = optimizer.compute_gradients(
            self.total_losses2[-1], THETA)
        metatrain_theta_op = optimizer.apply_gradients(gvs_theta)

        self.gvs_phi = gvs_phi = optimizer.compute_gradients(
            self.total_losses3[-1], PHI)
        metatrain_phi_op = optimizer.apply_gradients(gvs_phi)

        with tf.control_dependencies([metatrain_theta_op, metatrain_phi_op]):
          self.metatrain_op = tf.no_op()

        scale_v = [
            v for v in self.encoder_w.trainable_variables if 'scale' in v.name
        ]
        scale_norm = [tf.reduce_mean(v) for v in scale_v]
        scale_norm = tf.reduce_mean(scale_norm)

        tf.summary.scalar(prefix + 'full_loss', total_losses3[-1])
        tf.summary.scalar(prefix + 'regularizer',
                          total_losses3[-1] - total_losses2[-1])
        tf.summary.scalar(prefix + 'untransformed_scale', scale_norm)

    else:
      self.metaval_total_loss1 = tf.reduce_sum(
          lossesa) / tf.to_float(FLAGS.meta_batch_size)
      self.metaval_total_losses2 = total_losses2 = [
          tf.reduce_sum(msesb[j]) / tf.to_float(FLAGS.meta_batch_size)
          for j in range(len(msesb))
      ]
      self.metaval_total_losses3 = total_losses3 = [
          tf.reduce_sum(lossesb[j]) / tf.to_float(FLAGS.meta_batch_size)
          for j in range(len(lossesb))
      ]

    tf.summary.scalar(prefix + 'Pre-mse', total_losses2[0])
    tf.summary.scalar(prefix + 'Post-mse_' + str(num_updates),
                      total_losses2[-1])
Example #26
0
def build_bifpn_layer(feats, fpn_name, fpn_config, is_training, input_size,
                      fpn_num_filters, min_level, max_level, separable_conv,
                      apply_bn_for_resampling, conv_after_downsample,
                      use_native_resize_op, conv_bn_relu_pattern,
                      pooling_type):
    """Builds a feature pyramid given previous feature pyramid and config."""
    config = fpn_config or get_fpn_config(fpn_name)

    num_output_connections = [0 for _ in feats]
    for i, fnode in enumerate(config.nodes):
        with tf.variable_scope('fnode{}'.format(i)):
            logging.info('fnode %d : %s', i, fnode)
            new_node_width = int(fnode['width_ratio'] * input_size)
            nodes = []
            for idx, input_offset in enumerate(fnode['inputs_offsets']):
                input_node = feats[input_offset]
                num_output_connections[input_offset] += 1
                input_node = resample_feature_map(
                    input_node, '{}_{}_{}'.format(idx, input_offset,
                                                  len(feats)), new_node_width,
                    fpn_num_filters, apply_bn_for_resampling, is_training,
                    conv_after_downsample, use_native_resize_op, pooling_type)
                nodes.append(input_node)

            # Combine all nodes.
            dtype = nodes[0].dtype
            if config.weight_method == 'attn':
                edge_weights = [
                    tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype)
                    for _ in range(len(fnode['inputs_offsets']))
                ]
                normalized_weights = tf.nn.softmax(tf.stack(edge_weights))
                nodes = tf.stack(nodes, axis=-1)
                new_node = tf.reduce_sum(
                    tf.multiply(nodes, normalized_weights), -1)
            elif config.weight_method == 'fastattn':
                edge_weights = [
                    tf.nn.relu(
                        tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype))
                    for _ in range(len(fnode['inputs_offsets']))
                ]
                weights_sum = tf.add_n(edge_weights)
                nodes = [
                    nodes[i] * edge_weights[i] / (weights_sum + 0.0001)
                    for i in range(len(nodes))
                ]
                new_node = tf.add_n(nodes)
            elif config.weight_method == 'sum':
                new_node = tf.add_n(nodes)
            else:
                raise ValueError('unknown weight_method {}'.format(
                    config.weight_method))

            with tf.variable_scope('op_after_combine{}'.format(len(feats))):
                if not conv_bn_relu_pattern:
                    new_node = utils.relu_fn(new_node)

                if separable_conv:
                    conv_op = functools.partial(tf.layers.separable_conv2d,
                                                depth_multiplier=1)
                else:
                    conv_op = tf.layers.conv2d

                new_node = conv_op(
                    new_node,
                    filters=fpn_num_filters,
                    kernel_size=(3, 3),
                    padding='same',
                    use_bias=True if not conv_bn_relu_pattern else False,
                    name='conv')

                new_node = utils.batch_norm_relu(
                    new_node,
                    is_training_bn=is_training,
                    relu=False if not conv_bn_relu_pattern else True,
                    data_format='channels_last',
                    name='bn')

            feats.append(new_node)
            num_output_connections.append(0)

    output_feats = {}
    for l in range(min_level, max_level + 1):
        for i, fnode in enumerate(reversed(config.nodes)):
            if fnode['width_ratio'] == F(l):
                output_feats[l] = feats[-1 - i]
                break
    return output_feats
Example #27
0
    def _build_model(self):
        assert self.mode == 'train' or self.mode == 'eval'
        """Build the core model within the graph."""
        with tf.variable_scope('input'):

            self.x_input = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])

            self.y_input = tf.placeholder(tf.int32, shape=None)

            input_standardized = tf.map_fn(
                lambda img: tf.image.per_image_standardization(img),
                self.x_input)
            x = self._conv('init_conv', input_standardized, 3, 3, 16,
                           self._stride_arr(1))

        strides = [1, 2, 2]
        activate_before_residual = [True, False, False]
        res_func = self._residual

        # Uncomment the following codes to use w28-10 wide residual network.
        # It is more memory efficient than very deep residual network and has
        # comparably good performance.
        # https://arxiv.org/pdf/1605.07146v1.pdf
        filters = [16, 160, 320, 640]

        # Update hps.num_residual_units to 9

        with tf.variable_scope('unit_1_0'):
            x = res_func(x, filters[0], filters[1],
                         self._stride_arr(strides[0]),
                         activate_before_residual[0])
        for i in range(1, 5):
            with tf.variable_scope('unit_1_%d' % i):
                x = res_func(x, filters[1], filters[1], self._stride_arr(1),
                             False)

        with tf.variable_scope('unit_2_0'):
            x = res_func(x, filters[1], filters[2],
                         self._stride_arr(strides[1]),
                         activate_before_residual[1])
        for i in range(1, 5):
            with tf.variable_scope('unit_2_%d' % i):
                x = res_func(x, filters[2], filters[2], self._stride_arr(1),
                             False)

        with tf.variable_scope('unit_3_0'):
            x = res_func(x, filters[2], filters[3],
                         self._stride_arr(strides[2]),
                         activate_before_residual[2])
        for i in range(1, 5):
            with tf.variable_scope('unit_3_%d' % i):
                x = res_func(x, filters[3], filters[3], self._stride_arr(1),
                             False)

        with tf.variable_scope('unit_last'):
            x = self._batch_norm('final_bn', x)
            x = self._relu(x, 0.1)
            x = self._global_avg_pool(x)

        with tf.variable_scope('logit'):
            self.logits = self._fully_connected(x, 10)

        self.predictions = tf.argmax(self.logits, 1, output_type=tf.int32)
        self.correct_prediction = tf.equal(self.predictions, self.y_input)
        self.num_correct = tf.reduce_sum(
            tf.cast(self.correct_prediction, tf.int32))
        self.accuracy = tf.reduce_mean(
            tf.cast(self.correct_prediction, tf.float32))
Example #28
0
def resample_feature_map(feat,
                         name,
                         target_width,
                         target_num_channels,
                         apply_bn=False,
                         is_training=None,
                         conv_after_downsample=False,
                         use_native_resize_op=False,
                         pooling_type=None):
    """Resample input feature map to have target number of channels and width."""

    _, width, _, num_channels = feat.get_shape().as_list()
    if width is None or num_channels is None:
        raise ValueError(
            'shape[1] or shape[3] of feat is None (shape:{}).'.format(
                feat.shape))
    if apply_bn and is_training is None:
        raise ValueError('If BN is applied, need to provide is_training')

    def _maybe_apply_1x1(feat):
        """Apply 1x1 conv to change layer width if necessary."""
        if num_channels != target_num_channels:
            feat = tf.layers.conv2d(feat,
                                    filters=target_num_channels,
                                    kernel_size=(1, 1),
                                    padding='same')
            if apply_bn:
                feat = utils.batch_norm_relu(feat,
                                             is_training_bn=is_training,
                                             relu=False,
                                             data_format='channels_last',
                                             name='bn')
        return feat

    with tf.variable_scope('resample_{}'.format(name)):
        # If conv_after_downsample is True, when downsampling, apply 1x1 after
        # downsampling for efficiency.
        if width > target_width:
            if width % target_width != 0:
                raise ValueError('width ({}) is not divisible by '
                                 'target_width ({}).'.format(
                                     width, target_width))
            if not conv_after_downsample:
                feat = _maybe_apply_1x1(feat)
            stride_size = int(width // target_width)
            if pooling_type == 'max' or pooling_type is None:
                # Use max pooling in default.
                feat = tf.layers.max_pooling2d(
                    inputs=feat,
                    pool_size=stride_size + 1,
                    strides=[stride_size, stride_size],
                    padding='SAME',
                    data_format='channels_last')
            elif pooling_type == 'avg':
                feat = tf.layers.average_pooling2d(
                    inputs=feat,
                    pool_size=stride_size + 1,
                    strides=[stride_size, stride_size],
                    padding='SAME',
                    data_format='channels_last')
            else:
                raise ValueError(
                    'Unknown pooling type: {}'.format(pooling_type))
            if conv_after_downsample:
                feat = _maybe_apply_1x1(feat)
        else:
            if target_width % width != 0:
                raise ValueError('target_width ({}) is not divisible by '
                                 'width ({}).'.format(target_width, width))
            feat = _maybe_apply_1x1(feat)
            if width < target_width:
                _, h, w, _ = feat.get_shape().as_list()
                scale = target_width // width
                if use_native_resize_op:
                    feat = tf.image.resize_nearest_neighbor(
                        feat, [h * scale, w * scale])
                else:
                    feat = nearest_upsampling(feat, scale=scale)

    return feat
Example #29
0
def _get_action_logits(encoder_output,
                       decoder_output,
                       output_vocab_embeddings_table,
                       output_vocab_size,
                       model_config,
                       input_copy_mask=None,
                       use_gating_mechanism=True):
    """Generate output logits given decoder output.

  This effectively combines a Pointer Network (Vinyals et al., 2015) with a
  standard softmax output layer for selecting symbols from an output vocabulary,
  similar to:
      - Jia and Liang, 2016 (https://arxiv.org/abs/1606.03622)
      - Gulcehre et al., 2016 (https://arxiv.org/abs/1603.08148)
      - Gu et al., 2016 (https://arxiv.org/abs/1603.06393)
      - See et al. 2017 (https://arxiv.org/abs/1704.04368)

  Args:
    encoder_output: Tensor representing encoder output of shape (batch size,
      input length, encoder dims).
    decoder_output: Tensor representing decoder output of shape (batch size, #
      decoded steps, decoder dims).
    output_vocab_embeddings_table: Embeddings for output vocabulary of shape
      (output_vocab_size, target embedding dims).
    output_vocab_size: Integer size of output_vocab_embeddings_table outer dim.
    model_config: ModelConfig proto.
    input_copy_mask: Mask of the input sequence for copying.
    use_gating_mechanism: Whether to use gating mechanism.

  Returns:
    Tensor of shape (batch_size, output_vocab_size + input length) representing
    unnormalized logits for both copy and generate actions.
  """
    with tf.variable_scope("logits_transforms"):
        decoder_dims = decoder_output.get_shape()[-1]
        target_embedding_dims = model_config.model_parameters.target_embedding_dims

        # Dot product the decoder output with representations of each of the output
        # symbols to get a set of unnormalized logits for each output vocab item.
        # We need to tile the output vocab embeddings across the batch.
        output_vocab_transform = tf.expand_dims(output_vocab_embeddings_table,
                                                0)
        batch_size = tf.shape(decoder_output)[0]
        output_vocab_transform = tf.tile(output_vocab_transform,
                                         [batch_size, 1, 1])
        # Transform representations to the target_embedding_dims.
        if decoder_dims != target_embedding_dims:
            transformed_decoder_output = common_layers.linear_transform(
                decoder_output, target_embedding_dims, "decoder_transform")
        else:
            transformed_decoder_output = decoder_output
        generate_logits = tf.matmul(transformed_decoder_output,
                                    output_vocab_transform,
                                    transpose_b=True)
        generate_logits_bias = tf.get_variable("generate_logits_bias",
                                               shape=(output_vocab_size))
        generate_logits += generate_logits_bias

        # Dot product the decoder output with representations from the encoder
        # output.
        # This is necessary vs. re-using the encoder-decoder attention weights
        # because those use multihead attention.
        # First, need to transform representations to the decoder dimensions.
        transformed_encoder_output = common_layers.linear_transform(
            encoder_output, decoder_dims, "encoder_transform")

        copy_logits = tf.matmul(decoder_output,
                                transformed_encoder_output,
                                transpose_b=True)
        # This contains scores representing the probability of copying from input
        # (3rd dim) to output (2nd dim).

        # Optionally apply a soft gating mechanism to determine whether
        # to select from copy or generate logits.
        # TODO(petershaw): Evaluate and improve this gating mechanism.
        # The current implementation is most likely not optimal, since it applies
        # a scalar in the range [0,1] prior to softmax.
        if use_gating_mechanism:
            prob_gen_unnormalized = common_layers.linear_transform(
                decoder_output, 1, "prob_gen")
            prob_gen_bias = tf.get_variable("prob_gen_bias", shape=(1))
            prob_gen_unnormalized += prob_gen_bias
            prob_gen = tf.sigmoid(prob_gen_unnormalized)
            # Squeeze so that prob_gen has shape [batch_size, decode_length]
            prob_gen = tf.squeeze(prob_gen, axis=2)

            # These are the 'generate' logits so are scaled by P_gen.
            generate_logits *= tf.expand_dims(prob_gen, axis=-1)
            # These are the 'copy' logits so are scaled by 1 - P_gen.
            copy_logits *= tf.expand_dims(1 - prob_gen, axis=-1)

        if input_copy_mask is not None:
            copy_mask = (1 - tf.dtypes.cast(
                input_copy_mask, dtype=tf.dtypes.float32)) * LOGIT_MASK_VALUE
            copy_logits += tf.expand_dims(copy_mask, axis=1)

        # Concatenate logits into a single vector; first N (fixed) inputs are the
        # generation probabilities, and next are the copy probabilities for each
        # input (well, they aren't really probabilities, but scores.)
        extended_logits = tf.concat([generate_logits, copy_logits], axis=2)
        return extended_logits
Example #30
0
def conv_capsule_mat(input_tensor,
                     input_activation,
                     input_dim,
                     output_dim,
                     layer_name,
                     num_routing=3,
                     num_in_atoms=3,
                     num_out_atoms=3,
                     stride=2,
                     kernel_size=5,
                     min_var=0.0005,
                     final_beta=1.0):
    """Convolutional Capsule layer with Pose Matrices."""
    print('caps conv stride: {}'.format(stride))
    in_atom_sq = num_in_atoms * num_in_atoms
    with tf.variable_scope(layer_name):
        input_shape = tf.shape(input_tensor)
        _, _, _, in_height, in_width = input_tensor.get_shape()
        # This Variable will hold the state of the weights for the layer
        kernel = utils.weight_variable(shape=[
            input_dim, kernel_size, kernel_size, num_in_atoms,
            output_dim * num_out_atoms
        ],
                                       stddev=0.3)
        # kernel = tf.clip_by_norm(kernel, 3.0, axes=[1, 2, 3])
        activation_biases = utils.bias_variable(
            [1, 1, output_dim, 1, 1, 1, 1, 1],
            init_value=0.5,
            name='activation_biases')
        sigma_biases = utils.bias_variable([1, 1, output_dim, 1, 1, 1, 1, 1],
                                           init_value=.5,
                                           name='sigma_biases')
        with tf.name_scope('conv'):
            print('convi;')
            # input_tensor: [x,128,8, c1,c2] -> [x*128,8, c1,c2]
            print(input_tensor.get_shape())
            input_tensor_reshaped = tf.reshape(input_tensor, [
                input_shape[0] * input_dim * in_atom_sq, input_shape[3],
                input_shape[4], 1
            ])
            input_tensor_reshaped.set_shape((None, input_tensor.get_shape()[3],
                                             input_tensor.get_shape()[4], 1))
            input_act_reshaped = tf.reshape(input_activation, [
                input_shape[0] * input_dim, input_shape[3], input_shape[4], 1
            ])
            input_act_reshaped.set_shape((None, input_tensor.get_shape()[3],
                                          input_tensor.get_shape()[4], 1))
            print(input_tensor_reshaped.get_shape())
            # conv: [x*128,out*out_at, c3,c4]
            conv_patches = tf.extract_image_patches(
                images=input_tensor_reshaped,
                ksizes=[1, kernel_size, kernel_size, 1],
                strides=[1, stride, stride, 1],
                rates=[1, 1, 1, 1],
                padding='VALID',
            )
            act_patches = tf.extract_image_patches(
                images=input_act_reshaped,
                ksizes=[1, kernel_size, kernel_size, 1],
                strides=[1, stride, stride, 1],
                rates=[1, 1, 1, 1],
                padding='VALID',
            )
            o_height = (in_height - kernel_size) // stride + 1
            o_width = (in_width - kernel_size) // stride + 1
            patches = tf.reshape(conv_patches,
                                 (input_shape[0], input_dim, in_atom_sq,
                                  o_height, o_width, kernel_size, kernel_size))
            patches.set_shape((None, input_dim, in_atom_sq, o_height, o_width,
                               kernel_size, kernel_size))
            patch_trans = tf.transpose(patches, [1, 5, 6, 0, 3, 4, 2])
            patch_split = tf.reshape(
                patch_trans,
                (input_dim, kernel_size, kernel_size, input_shape[0] *
                 o_height * o_width * num_in_atoms, num_in_atoms))
            patch_split.set_shape(
                (input_dim, kernel_size, kernel_size, None, num_in_atoms))
            a_patches = tf.reshape(act_patches,
                                   (input_shape[0], input_dim, 1, 1, o_height,
                                    o_width, kernel_size, kernel_size))
            a_patches.set_shape((None, input_dim, 1, 1, o_height, o_width,
                                 kernel_size, kernel_size))
            with tf.name_scope('input_act'):
                utils.activation_summary(
                    tf.reduce_sum(tf.reduce_sum(tf.reduce_sum(a_patches,
                                                              axis=1),
                                                axis=-1),
                                  axis=-1))
            with tf.name_scope('Wx'):
                wx = tf.matmul(patch_split, kernel)
                wx = tf.reshape(wx, (input_dim, kernel_size, kernel_size,
                                     input_shape[0], o_height, o_width,
                                     num_in_atoms * num_out_atoms, output_dim))
                wx.set_shape(
                    (input_dim, kernel_size, kernel_size, None, o_height,
                     o_width, num_in_atoms * num_out_atoms, output_dim))
                wx = tf.transpose(wx, [3, 0, 7, 6, 4, 5, 1, 2])
                utils.activation_summary(wx)

        with tf.name_scope('routing'):
            # Routing
            # logits: [x, 128, 10, c3, c4]
            logit_shape = [
                input_dim, output_dim, 1, o_height, o_width, kernel_size,
                kernel_size
            ]
            activation, center = update_conv_routing(
                wx=wx,
                input_activation=a_patches,
                activation_biases=activation_biases,
                sigma_biases=sigma_biases,
                logit_shape=logit_shape,
                num_out_atoms=num_out_atoms * num_out_atoms,
                input_dim=input_dim,
                num_routing=num_routing,
                output_dim=output_dim,
                min_var=min_var,
                final_beta=final_beta,
            )
            # activations: [x, 10, 8, c3, c4]

        out_activation = tf.squeeze(activation, axis=[1, 3, 6, 7])
        out_center = tf.squeeze(center, axis=[1, 6, 7])
        with tf.name_scope('center'):
            utils.activation_summary(out_center)
        return tf.sigmoid(out_activation), out_center