Beispiel #1
0
def optimize(args):
    """Run optimisation-based style transfer and save the resulting image.

    Reference: Gatys et al., "Image Style Transfer Using Convolutional
    Neural Networks".

    The image being optimised is held in a single ``Parameter`` that starts
    as a copy of the pre-processed content image and is updated with Adam
    against a weighted sum of a content loss and a style (Gram-matrix) loss
    computed on VGG-16 features.

    Parameters
    ----------
    args
        Options object. The attributes read here are ``cuda``,
        ``content_image``, ``content_size``, ``style_image``, ``style_size``,
        ``lr``, ``iters``, ``content_weight``, ``style_weight``,
        ``log_interval`` and ``output_image``.
    """
    device = mx.gpu(0) if args.cuda else mx.cpu(0)

    # Load the content and style targets and apply the ImageNet-mean
    # pre-processing helper to each of them.
    content_image = utils.subtract_imagenet_mean_preprocess_batch(
        utils.tensor_load_rgbimage(args.content_image, device,
                                   size=args.content_size, keep_asp=True))
    style_image = utils.subtract_imagenet_mean_preprocess_batch(
        utils.tensor_load_rgbimage(args.style_image, device,
                                   size=args.style_size))

    # Pre-trained VGG-16 used as the feature extractor.
    vgg = net.Vgg16()
    utils.init_vgg_params(vgg, 'models', ctx=device)

    # Fixed targets: the feature map at index 1 for content, and the Gram
    # matrix of every returned feature map for style.
    content_target = vgg(content_image)[1]
    style_targets = [net.gram_matrix(feat) for feat in vgg(style_image)]

    # The optimised image lives in a Parameter so that a Trainer can update it.
    canvas = Parameter('output', shape=content_image.shape)
    canvas.initialize(ctx=device)
    canvas.set_data(content_image)

    trainer = gluon.Trainer([canvas], 'adam', {'learning_rate': args.lr})
    l2 = gluon.loss.L2Loss()

    for step in range(1, args.iters + 1):
        # Clamp helper is applied to the current image before every update.
        utils.imagenet_clamp_batch(canvas.data(), 0, 255)
        with autograd.record():
            feats = vgg(canvas.data())
            content_loss = 2 * args.content_weight * l2(feats[1], content_target)
            style_loss = 0.
            for layer_idx, feat in enumerate(feats):
                style_term = l2(net.gram_matrix(feat), style_targets[layer_idx])
                style_loss = style_loss + 2 * args.style_weight * style_term
            total_loss = content_loss + style_loss
            total_loss.backward()

        trainer.step(1)
        if step % args.log_interval == 0:
            print('loss:{:.2f}'.format(total_loss.asnumpy()[0]))

    # Undo the mean subtraction and write the first image of the batch.
    result = utils.add_imagenet_mean_batch(canvas.data())
    utils.tensor_save_bgrimage(result[0], args.output_image, args.cuda)
Beispiel #2
0
    def __init__(
        self,
        d_hidden: int,
        kernel_sizes: List[int],
        n_head: int = 1,
        bias: bool = True,
        bidirectional: bool = False,
        dist_enc: Optional[str] = None,
        share_values: bool = False,
        dropout: float = 0.0,
        temperature: float = 1.0,
        **kwargs,
    ):
        """
        Self-attention module with q,k,v from the same input

        Parameters
        ----------
        d_hidden : int
            hidden dimension
        kernel_sizes : List[int]
            kernel sizes of convolutions to generate queries and keys; one
            convolution ("group") is created per entry, so the list must not
            be empty
        n_head : int, optional
            number of attention heads, by default 1
        bias : bool, optional
            add bias term in input and output projections, by default True
        bidirectional : bool, optional
            if False, add a mask to avoid backward attention, by default False
        dist_enc : Optional[str], optional
            add relative distance embeddings to dot-product attention, can be
                'add' (linearly combine key and dist),
                'dot' (dot product between key and dist),
                or None (disabled),
            by default None
        share_values : bool, optional
            if True, a value representation is shared by all attention heads, by default False
            ref. https://arxiv.org/abs/1912.09363
        dropout : float, optional
            dropout rate, by default 0.0
        temperature : float, optional
            softmax temperature, by default 1.0
        **kwargs
            forwarded unchanged to the parent class constructor

        Raises
        ------
        AssertionError
            if `kernel_sizes` is empty, if `d_hidden` is not divisible by
            `n_head` or by the number of kernel sizes, if `n_head` is not
            divisible by the number of kernel sizes, or if `dist_enc` is
            neither None, 'dot' nor 'add'
        """
        super(SelfAttention, self).__init__(**kwargs)
        n_groups = len(kernel_sizes)
        # Guard first: the modulo checks below would otherwise fail with an
        # uninformative ZeroDivisionError for an empty list.
        assert n_groups > 0, "kernel_sizes must contain at least one kernel size."
        assert (
            d_hidden % n_head == 0
        ), f"hidden dim {d_hidden} cannot be split into {n_head} heads."
        assert (
            d_hidden % n_groups == 0
        ), f"hidden dim {d_hidden} cannot be split into {n_groups} groups."
        # The message must interpolate `n_head`; an undefined name here would
        # turn a failed check into a NameError instead of an AssertionError.
        assert (
            n_head % n_groups == 0
        ), f"num_heads {n_head} cannot be allocated for {n_groups} groups."
        self.d_hidden = d_hidden
        self.kernel_sizes = kernel_sizes
        self.n_groups = n_groups
        self.d_group = self.d_hidden // self.n_groups
        self.n_head = n_head
        self.d_head = self.d_hidden // self.n_head
        self.bias = bias
        self.dist_enc = dist_enc
        self.bidirectional = bidirectional
        self.share_values = share_values
        self.temperature = temperature

        with self.name_scope():
            # One convolution per kernel size; each emits 2 * d_group channels
            # and the outputs are concatenated along the last axis.
            self.qk_proj = HybridConcurrent(axis=-1, prefix="qk_proj_")
            for ksize in self.kernel_sizes:
                self.qk_proj.add(
                    CausalConv1D(
                        channels=self.d_group * 2,
                        kernel_size=ksize,
                        prefix=f"conv{ksize}_",
                    ))
            # With shared values a single head-sized projection is created
            # instead of a full d_hidden-sized one.
            self.v_proj = nn.Dense(
                units=self.d_head if self.share_values else d_hidden,
                use_bias=bias,
                flatten=False,
                weight_initializer=init.Xavier(),
                prefix="v_proj_",
            )
            self.out_proj = nn.Dense(
                units=d_hidden,
                use_bias=bias,
                flatten=False,
                weight_initializer=init.Xavier(),
                prefix="out_proj_",
            )

            if self.dist_enc is not None:
                assert self.dist_enc in [
                    "dot",
                    "add",
                ], f"distance encoding type {self.dist_enc} is not supported"
                self.posemb = SinusoidalPositionalEmbedding(d_hidden)
                self.pos_proj = nn.Dense(
                    units=d_hidden,
                    use_bias=bias,
                    flatten=False,
                    weight_initializer=init.Xavier(),
                    prefix="pos_proj_",
                )
                # Only the 'add' variant creates the two per-head bias
                # parameters below.
                if self.dist_enc == "add":
                    self._ctt_bias_weight = Parameter(
                        "_ctt_bias_weight",
                        shape=(1, n_head, 1, self.d_head),
                        init=init.Xavier(),
                    )
                    self._pos_bias_weight = Parameter(
                        "_pos_bias_weight",
                        shape=(1, n_head, 1, self.d_head),
                        init=init.Xavier(),
                    )

            self.dropout = nn.Dropout(dropout)
Beispiel #3
0
class NoNorm(HybridBlock):
    r"""
    Element-wise affine transformation of the input, used as a replacement
    for layer normalization.

    .. math::
        out = \gamma \circ data + \beta

    Parameters
    ----------
    in_channels : int
        Length of the `gamma` and `beta` vectors.
    center: bool, default True
        If True, `beta` is created with ``grad_req='write'``; if False with
        ``grad_req='null'``. `beta` is added in `forward` in both cases.
    scale: bool, default True
        If True, `gamma` is created with ``grad_req='write'``; if False with
        ``grad_req='null'``. `gamma` is multiplied in `forward` in both cases.
    beta_initializer: str or `Initializer`, default 'zeros'
        Initializer for the beta weight.
    gamma_initializer: str or `Initializer`, default 'ones'
        Initializer for the gamma weight.
    dtype: str, default 'float32'
        Data type of `gamma` and `beta`.

    Inputs:
        - **data**: input tensor with arbitrary shape.

    Outputs:
        - **out**: output tensor with the same shape as `data`.

    References
    ----------
        `MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices
        <https://arxiv.org/pdf/2004.02984.pdf>`_

    Examples
    --------
    >>> # Input of shape (2, 5)
    >>> x = mx.np.array([[1, 2, 3, 4, 5], [1, 1, 2, 2, 2]])
    >>> # With the default gamma (ones) and beta (zeros) the input is unchanged
    >>> layer = NoNorm(in_channels=5)
    >>> layer.initialize(ctx=mx.cpu(0))
    >>> layer(x)
    array([[1., 2., 3., 4., 5.],
       [1., 1., 2., 2., 2.]])
    """
    def __init__(self,
                 in_channels,
                 center=True,
                 scale=True,
                 beta_initializer='zeros',
                 gamma_initializer='ones',
                 dtype='float32',
                 **kwargs):
        super().__init__(**kwargs)
        self._in_channels = in_channels
        # Kept only so that __repr__ can report the configuration.
        self._kwargs = {'center': center, 'scale': scale}
        param_shape = (in_channels, )
        self.gamma = Parameter('gamma',
                               shape=param_shape,
                               init=gamma_initializer,
                               dtype=dtype,
                               grad_req='write' if scale else 'null')
        self.beta = Parameter('beta',
                              shape=param_shape,
                              init=beta_initializer,
                              dtype=dtype,
                              grad_req='write' if center else 'null')

    def forward(self, data):
        """Return ``data * gamma + beta``."""
        scaled = data * self.gamma.data()
        return scaled + self.beta.data()

    def __repr__(self):
        """Render as ``ClassName(center=..., scale=..., in_channels=...)``."""
        settings = ', '.join(f'{key}={value!r}'
                             for key, value in self._kwargs.items())
        # in_channels is read back from the parameter's shape, not from the
        # constructor argument.
        return f'{self.__class__.__name__}({settings}, in_channels={self.gamma.shape[0]})'
Beispiel #4
0
    def __init__(self,
                 vocab_size: int,
                 embed_size: int,
                 units: int,
                 cutoffs: Optional[Union[int, List]] = None,
                 div_val: float = 1.0,
                 dtype='float32',
                 scaled=True,
                 embedding_initializer: InitializerType = None,
                 weight_initializer: InitializerType = None):
        """Create the embedding (and projection) parameters of every cluster.

        Parameters
        ----------
        vocab_size
            The size of the vocabulary
        embed_size
            The base size of the embedding vectors. The embedding size of each cluster will be
            [embed_size / div_val**0, embed_size / div_val**1, embed_size / div_val**2, ...]
        units
            The number of units after the mapping
        cutoffs
            The cutoffs to slice the vocab to multiple clusters. It should be a sorted list. Each
            value should be between 1 --> vocab_size - 1.
        div_val
            The base denominator for computing the size of the embedding vector in each cluster.
            Must be 1.0 when no cutoffs are given.
        dtype
            The data type of layer
        scaled
            Whether to scale the embedding by sqrt(units)
        embedding_initializer
            Initializer of the embedding vectors
        weight_initializer
            Initializer of projection layers

        Raises
        ------
        ValueError
            If `div_val` shrinks the embedding size of some cluster to 0.
        """
        super().__init__()
        cutoffs = _fmt_and_check_cutoffs(cutoffs, vocab_size)
        if cutoffs is None:
            assert div_val == 1.0

        # Plain configuration attributes.
        self._vocab_size = vocab_size
        self._embed_size = embed_size
        self._units = units
        self._cutoffs = cutoffs
        self._div_val = div_val
        self._dtype = dtype
        self._scaled = scaled
        self._kwargs = OrderedDict([
            ('cutoffs', cutoffs),
            ('vocab_size', vocab_size),
            ('embed_size', embed_size),
            ('units', units),
            ('div_val', div_val),
            ('dtype', dtype),
            ('scaled', scaled),
        ])
        if self._scaled:
            self._emb_scale = units**0.5

        if div_val != 1.0:
            # One embedding table plus one projection per cluster; cluster i
            # covers vocabulary ids [lower, upper) and has a shrunken width.
            self.proj_layers = nn.HybridSequential()
            lower_bounds = [0] + cutoffs
            upper_bounds = cutoffs + [vocab_size]
            for idx, (lower, upper) in enumerate(zip(lower_bounds, upper_bounds)):
                cluster_embed_size = int(embed_size / div_val**idx)
                if cluster_embed_size == 0:
                    raise ValueError(
                        'div_val = {} is too large for the layer. Currently, the '
                        'cutoffs are {} and the embed_size is {}. Using the '
                        'div_val = {} will cause some clusters to have '
                        'embed_size=0.'.format(div_val, cutoffs, embed_size,
                                               div_val))
                embed_name = 'embed{}_weight'.format(idx)
                proj_name = 'inter_proj{}_weight'.format(idx)
                setattr(self, embed_name,
                        Parameter(embed_name,
                                  shape=(upper - lower, cluster_embed_size),
                                  init=embedding_initializer,
                                  allow_deferred_init=True))
                setattr(self, proj_name,
                        Parameter(proj_name,
                                  shape=(cluster_embed_size, units),
                                  init=weight_initializer,
                                  allow_deferred_init=True))
        else:
            # A single table for the whole vocabulary.
            self.embed0_weight = Parameter('embed0_weight',
                                           shape=(vocab_size, embed_size),
                                           init=embedding_initializer,
                                           allow_deferred_init=True)
            if units == embed_size:
                # No projection parameter is created in this case.
                self.proj_layers = None
            else:
                self.inter_proj0_weight = Parameter('inter_proj0_weight',
                                                    shape=(embed_size, units),
                                                    init=weight_initializer,
                                                    allow_deferred_init=True)
Beispiel #5
0
 def __init__(self, d_model, epsilon, dtype):
     """Create the scale parameter and remember the epsilon.

     Parameters
     ----------
     d_model
         Shape of the scale parameter (passed straight through as `shape`).
     epsilon
         Value stored as `variance_epsilon`.
     dtype
         Data type of the scale parameter.
     """
     super().__init__()
     # Registered under the name 'layernorm_weight' and initialised to ones.
     scale_weight = Parameter('layernorm_weight',
                              dtype=dtype,
                              init='ones',
                              shape=d_model)
     self.variance_epsilon = epsilon
     # Attribute name `gemma` is kept verbatim; code outside this block may read it.
     self.gemma = scale_weight
Beispiel #6
0
class TransformerXLDecoder(HybridBlock):
    """Stack of `TransformerXLDecoderLayer` blocks that share two query bias
    parameters (`query_k_bias` and `query_r_bias`)."""

    def __init__(self,
                 num_layers=3,
                 units=512,
                 hidden_size=2048,
                 num_heads=8,
                 activation_dropout=0.1,
                 dropout=0.1,
                 attention_dropout=0.0,
                 layernorm_eps=1E-12,
                 activation='relu',
                 dtype='float32',
                 layout='NT',
                 pre_norm=False,
                 weight_initializer=None,
                 bias_initializer=None):
        """Create the shared bias parameters and `num_layers` decoder layers.

        Every argument except `num_layers` is forwarded to each
        `TransformerXLDecoderLayer`; `layernorm_eps` is passed on under the
        keyword `layer_norm_eps`. `units`, `num_heads` and `bias_initializer`
        are additionally used for the two shared bias parameters.
        """
        super().__init__()
        # Both biases hold one vector of size units // num_heads per head.
        head_bias_shape = (num_heads, units // num_heads)
        self.query_k_bias = Parameter('query_k_bias',
                                      shape=head_bias_shape,
                                      init=bias_initializer,
                                      allow_deferred_init=True)
        self.query_r_bias = Parameter('query_r_bias',
                                      shape=head_bias_shape,
                                      init=bias_initializer,
                                      allow_deferred_init=True)
        # Identical configuration for every layer in the stack.
        layer_config = dict(units=units,
                            hidden_size=hidden_size,
                            num_heads=num_heads,
                            activation_dropout=activation_dropout,
                            dropout=dropout,
                            attention_dropout=attention_dropout,
                            layer_norm_eps=layernorm_eps,
                            activation=activation,
                            dtype=dtype,
                            layout=layout,
                            pre_norm=pre_norm,
                            weight_initializer=weight_initializer,
                            bias_initializer=bias_initializer)
        self.decoder_layers = HybridSequential()
        for _ in range(num_layers):
            self.decoder_layers.add(TransformerXLDecoderLayer(**layer_config))

    def forward(self, data, mem_l, rel_positions, mask):
        """Run the input through every decoder layer in order.

        Shapes below are taken from the original documentation of this
        method; the code here does not check them.

        Parameters
        ----------
        data
            - layout = 'NT':
                Shape (batch_size, query_length)
            - layout = 'TN':
                Shape (query_length, batch_size)
            NOTE(review): a trailing feature axis is presumably present as
            well -- confirm against the caller.
        mem_l
            Contains a list of memory objects, one per layer (indexed by
            layer position), each one will contain:
            - layout = 'NT':
                Shape (batch_size, mem_length, C_i)
            - layout = 'TN':
                Shape (mem_length, batch_size, C_i)
        rel_positions
            The relative positions.
            Shape (query_length, mem_length + query_length)
        mask
            Mask between the query and the memory + query.
            Shape (batch_size, query_length, mem_length + query_length)

        Returns
        -------
        out_l
            Contains a list of hidden states, one per layer, each will contain:
            - layout = 'NT'
                Shape (batch_size, query_length, C_o)
            - layout = 'TN'
                Shape (query_length, batch_size, C_o)
        """
        k_bias = self.query_k_bias.data()
        r_bias = self.query_r_bias.data()
        hidden = data
        layer_outputs = []
        for idx, layer in enumerate(self.decoder_layers):
            # Each layer consumes the previous layer's output together with
            # its own memory; the r-bias is passed before the k-bias.
            hidden = layer(hidden, mem_l[idx], rel_positions, mask, r_bias,
                           k_bias)
            layer_outputs.append(hidden)
        return layer_outputs