Esempio n. 1
0
def optimize(args):
    """Run optimization-based neural style transfer and save the result.

    Follows Gatys et al., "Image Style Transfer Using Convolutional Neural
    Networks" (CVPR 2016): the output image itself is the trainable parameter.
    It is initialized from the preprocessed content image and updated with
    Adam so that its VGG-16 features match the content features and the Gram
    matrices of its features match those of the style image.

    Parameters
    ----------
    args
        Option object. The attributes read here are ``cuda``,
        ``content_image``, ``content_size``, ``style_image``, ``style_size``,
        ``lr``, ``iters``, ``content_weight``, ``style_weight``,
        ``log_interval`` and ``output_image``.
    """
    ctx = mx.gpu(0) if args.cuda else mx.cpu(0)

    # Load and preprocess the content and style targets.
    content_image = utils.tensor_load_rgbimage(args.content_image, ctx,
                                               size=args.content_size,
                                               keep_asp=True)
    content_image = utils.subtract_imagenet_mean_preprocess_batch(content_image)
    style_image = utils.tensor_load_rgbimage(args.style_image, ctx,
                                             size=args.style_size)
    style_image = utils.subtract_imagenet_mean_preprocess_batch(style_image)

    # Load the pre-trained VGG-16 used as the fixed feature extractor.
    vgg = net.Vgg16()
    utils.init_vgg_params(vgg, 'models', ctx=ctx)

    # Content target: the feature map at index 1 of the VGG outputs.
    f_xc_c = vgg(content_image)[1]
    # Style targets: one Gram matrix per VGG output.
    gram_style = [net.gram_matrix(y) for y in vgg(style_image)]

    # The image being optimized, initialized from the content image.
    output = Parameter('output', shape=content_image.shape)
    output.initialize(ctx=ctx)
    output.set_data(content_image)

    trainer = gluon.Trainer([output], 'adam', {'learning_rate': args.lr})
    mse_loss = gluon.loss.L2Loss()

    for e in range(args.iters):
        # Clamp the current image before each step.
        utils.imagenet_clamp_batch(output.data(), 0, 255)
        with autograd.record():
            features_y = vgg(output.data())
            content_loss = 2 * args.content_weight * mse_loss(features_y[1], f_xc_c)
            style_loss = 0.
            # features_y and gram_style come from the same network, so they
            # pair up one-to-one.
            for feat_y, gram_s in zip(features_y, gram_style):
                gram_y = net.gram_matrix(feat_y)
                style_loss = style_loss + 2 * args.style_weight * mse_loss(gram_y, gram_s)
            total_loss = content_loss + style_loss
        # Only the forward pass needs recording; backward runs outside the scope.
        total_loss.backward()

        trainer.step(1)
        if (e + 1) % args.log_interval == 0:
            print('loss:{:.2f}'.format(total_loss.asnumpy()[0]))

    # Undo the mean subtraction and write the first image of the batch.
    result = utils.add_imagenet_mean_batch(output.data())
    utils.tensor_save_bgrimage(result[0], args.output_image, args.cuda)
Esempio n. 2
0
class LearnedPositionalEmbedding(HybridBlock):
    """Positional embedding implemented as a learnable lookup table.

    The table is a single ``weight`` parameter of shape
    ``(max_length, units)``; ``forward`` gathers rows of it by position index.

    Parameters
    ----------
    units
        Size of each embedding vector (second axis of ``weight``).
    max_length
        Number of rows in the table (first axis of ``weight``).
    mode
        Forwarded unchanged as the ``mode`` argument of ``np.take``.
        NOTE(review): presumably controls out-of-range handling
        ('clip' by default) - confirm against the ``np`` in scope.
    dtype
        Data type of the ``weight`` parameter.
    weight_initializer
        Initializer passed to the ``weight`` parameter.
    """
    def __init__(self,
                 units,
                 max_length,
                 mode='clip',
                 dtype='float32',
                 weight_initializer=None):
        super().__init__()
        self._max_length = max_length
        self._units = units
        self._mode = mode
        self._dtype = dtype

        table_shape = (max_length, units)
        self.weight = Parameter('weight',
                                shape=table_shape,
                                init=weight_initializer,
                                dtype=dtype,
                                allow_deferred_init=True)

    def __repr__(self):
        fields = (('units', self._units),
                  ('max_length', self._max_length),
                  ('mode', self._mode),
                  ('dtype', self._dtype))
        rendered = ', '.join('{}={}'.format(key, value) for key, value in fields)
        return '{}({})'.format(self.__class__.__name__, rendered)

    def forward(self, positions):
        """Return the rows of ``weight`` selected by ``positions`` (axis 0)."""
        table = self.weight.data()
        return np.take(table, positions, axis=0, mode=self._mode)
Esempio n. 3
0
class RMSNorm(HybridBlock):
    """Root mean square layer normalization over the last axis.

    Unlike layer normalization, the mean is not subtracted in the numerator:
    the input is divided by ``sqrt(mean(data ** 2) + eps)`` computed over the
    last axis, then multiplied by ``gamma`` and shifted by ``beta``.

    For more details, see the paper: https://arxiv.org/pdf/1910.07467.pdf

    Parameters
    ----------
    in_channels
        Length of the ``gamma`` and ``beta`` vectors.
    center
        Selects ``grad_req`` of ``beta``: 'write' if True, 'null' otherwise.
        ``beta`` is added in ``forward`` in both cases.
    scale
        Selects ``grad_req`` of ``gamma``: 'write' if True, 'null' otherwise.
        ``gamma`` is multiplied in ``forward`` in both cases.
    beta_initializer
        Initializer for ``beta``.
    gamma_initializer
        Initializer for ``gamma``.
    variance_epsilon
        Constant added to the mean square before taking the square root.
    dtype
        Data type of both parameters.
    **kwargs
        Accepted but not used by this constructor.
    """
    def __init__(self,
                 in_channels,
                 center=True,
                 scale=True,
                 beta_initializer='zeros',
                 gamma_initializer='ones',
                 variance_epsilon=1E-6,
                 dtype='float32',
                 **kwargs):
        super().__init__()
        self._in_channels = in_channels
        self._epsilon = variance_epsilon
        self._kwargs = {'center': center, 'scale': scale}

        param_shape = (in_channels, )
        gamma_grad_req = 'write' if scale else 'null'
        beta_grad_req = 'write' if center else 'null'
        self.gamma = Parameter('gamma',
                               grad_req=gamma_grad_req,
                               shape=param_shape,
                               init=gamma_initializer,
                               dtype=dtype)
        self.beta = Parameter('beta',
                              grad_req=beta_grad_req,
                              shape=param_shape,
                              init=beta_initializer,
                              dtype=dtype)

    def forward(self, data):
        """Normalize ``data`` by its RMS over the last axis, then scale and shift."""
        mean_square = np.power(data, 2).mean(-1, keepdims=True)
        inv_rms = np.reciprocal(np.sqrt(mean_square + self._epsilon))
        normalized = data * inv_rms
        return normalized * self.gamma.data() + self.beta.data()

    def __repr__(self):
        options = ', '.join('='.join([key, value.__repr__()])
                            for key, value in self._kwargs.items())
        return '{}({}, in_channels={})'.format(self.__class__.__name__,
                                               options,
                                               self.gamma.shape[0])
Esempio n. 4
0
def check_gradient(forward_fn,
                   fn_params: List[mx.ndarray.NDArray],
                   wrt: Parameter,
                   seed=None,
                   eps=3e-4,
                   tol=1e-2) -> bool:
    """
    Check autograd backward for a given function using finite differencing.

    The value of `wrt` is perturbed element by element while computing central differences. It is always restored to
    its original value before returning, including when `forward_fn` raises.

    :param forward_fn: The function to test the gradients of. This function should return a scalar.
    :param fn_params: A list of parameters to call the function.
    :param wrt: The parameter with respect to which we take the gradient.
    :param seed: Random seed for mxnet and numpy. Note that the forward function might be stochastic. We reinitialize
        the seed to the same number before every forward function call. If None, a seed is drawn from numpy's
        global random state.
    :param eps: Epsilon used in finite differencing. The default value is taken from theano's verify_grad function.
    :param tol: Absolute and relative tolerance used to check equality. Again, the default value is taken from theano's
        verify_grad function.
    :return: True if check succeeds.
    """
    if seed is None:
        seed = int(np.random.rand() * 1e6)

    def _reseed():
        # Make every forward call see the same random stream.
        mx.random.seed(seed)
        np.random.seed(seed)

    # calculate gradient with autograd
    _reseed()
    with autograd.record():
        out = forward_fn(*fn_params)

    autograd.backward(out)
    ag_grad = wrt.grad().asnumpy()

    # calculate gradient by finite difference
    orig_data = wrt.data().asnumpy()
    # Perturb a scratch copy so orig_data stays an exact snapshot to restore from.
    perturbed = orig_data.copy()
    fd_grad = np.zeros_like(orig_data)
    try:
        for i in range(orig_data.size):
            ix = np.unravel_index(i, orig_data.shape)
            base = orig_data[ix]

            # f(x + h)
            perturbed[ix] = base + eps
            wrt.set_data(perturbed)
            _reseed()
            out_ph = forward_fn(*fn_params).asscalar()

            # f(x - h)
            perturbed[ix] = base - eps
            wrt.set_data(perturbed)
            _reseed()
            out_mh = forward_fn(*fn_params).asscalar()

            # Reset from the snapshot rather than by arithmetic, so rounding
            # error does not accumulate across elements.
            perturbed[ix] = base

            # calc gradient
            fd_grad[ix] = (out_ph - out_mh) / (2 * eps)
    finally:
        # Leave the parameter exactly as it was handed to us.
        wrt.set_data(orig_data)

    return np.allclose(ag_grad, fd_grad, atol=tol, rtol=tol)
def print_top_words(weight: gluon.Parameter,
                    id2word: dict,
                    top: int = 10) -> None:
    """Log the highest-weighted words of every factor.

    :param weight: Two-dimensional parameter; each row is one factor and each column index is a word id.
    :param id2word: Mapping from word id to the word to display.
    :param top: Number of words logged per factor.
    """
    # Unpacking also enforces that the parameter is two-dimensional.
    n_factors, _ = weight.shape
    values = weight.data().asnumpy()
    for factor_idx in range(n_factors):
        row = values[factor_idx]
        # Word ids ordered by descending weight, truncated to the requested count.
        descending = np.argsort(row)[::-1]
        best_ids = descending[0:top]
        logger.info('----------')
        logger.info('factor %d:' % factor_idx)
        for word_idx in best_ids:
            line = '%.3e\t%s' % (row[word_idx], id2word[word_idx])
            logger.info(line)
def print_nearest_cosine_distance(embeddings: gluon.Parameter,
                                  id2word: dict,
                                  num: int = 10,
                                  num_nearest: int = 10) -> None:
    """Log the nearest neighbors, by cosine similarity, of the first `num` words.

    The parameter's data is transposed before use, so after transposition each row is indexed by word id. The query
    words are the first `num` keys of `id2word` in iteration order.

    :param embeddings: Embedding parameter; its second axis is indexed by word id.
    :param id2word: Mapping from word id to the word to display.
    :param num: Number of query words to report on.
    :param num_nearest: Number of neighbors logged per query word (the query word itself is never listed).
    """
    vectors = embeddings.data().asnumpy().T
    query_ids = list(id2word.keys())[0:num]
    if not query_ids:
        # Nothing to report; cosine_similarity cannot take an empty query set.
        return

    similarities = sklearn.metrics.pairwise.cosine_similarity(
        vectors[query_ids], vectors)
    for idx, similarity in zip(query_ids, similarities):
        ranked = np.argsort(similarity)[::-1]
        # Drop the query word explicitly instead of assuming it sorts first:
        # ties or zero vectors can put another word at the top.
        neighbours = ranked[ranked != idx][:num_nearest]
        logger.info('----------')
        logger.info("nearest words in cosine distance to: %s" % id2word[idx])
        for nearest in neighbours:
            logger.info('%.3e\t%s' % (similarity[nearest], id2word[nearest]))
Esempio n. 7
0
class BucketPositionalEmbedding(HybridBlock):
    """Embed relative positions after mapping them to a fixed number of buckets.

    Relative positions are converted to bucket ids by
    ``relative_position_bucket`` and each bucket id selects one row of the
    learnable ``weight`` table of shape ``(num_buckets, units)``. Positions
    that fall in the same bucket therefore share an embedding; positions that
    are out-of-the-boundary are treated as falling into one bucket.

    This is used in the T5 paper:
    "[Arxiv2019] Exploring the limits of transfer learning with a unified text-to-text transformer",

    Here, the first half of the buckets handles the small shifts and the second half
    of the buckets handles the large shifts (mapping them in logarithmically separated bins).

    Parameters
    ----------
    units
        Size of each embedding vector (second axis of ``weight``).
    bidirectional
        Forwarded to ``relative_position_bucket``.
    num_buckets
        Number of buckets, i.e. rows of ``weight``; also forwarded to
        ``relative_position_bucket``.
    max_distance
        Forwarded to ``relative_position_bucket``.
    dtype
        Data type of the ``weight`` parameter.
    embed_initializer
        Initializer passed to the ``weight`` parameter.
    """
    def __init__(self,
                 units,
                 bidirectional=True,
                 num_buckets=32,
                 max_distance=128,
                 dtype='float32',
                 embed_initializer=None):
        super().__init__()
        self._units = units
        self._dtype = dtype
        self._bidirectional = bidirectional
        self._num_buckets = num_buckets
        self._max_distance = max_distance

        table_shape = (num_buckets, units)
        self.weight = Parameter('weight',
                                shape=table_shape,
                                init=embed_initializer,
                                dtype=dtype,
                                allow_deferred_init=True)

    def __repr__(self):
        fields = (('units', self._units),
                  ('bidirectional', self._bidirectional),
                  ('num_buckets', self._num_buckets),
                  ('max_distance', self._max_distance),
                  ('dtype', self._dtype))
        rendered = ', '.join('{}={}'.format(key, value) for key, value in fields)
        return '{}({})'.format(self.__class__.__name__, rendered)

    def forward(self, relative_positions):
        """Return the ``weight`` rows for the buckets of ``relative_positions``."""
        bucket_ids = relative_position_bucket(relative_positions,
                                              bidirectional=self._bidirectional,
                                              num_buckets=self._num_buckets,
                                              max_distance=self._max_distance)
        table = self.weight.data()
        return np.take(table, bucket_ids, axis=0)
Esempio n. 8
0
class NoNorm(HybridBlock):
    r"""
    Apply an element-wise linear transformation to the n-dimensional input array,
    replacing the layer normalization.

    .. math::
        out = \gamma \circ data + \beta

    Parameters
    ----------
    in_channels : int
        Length of the `gamma` and `beta` vectors.
    center: bool, default True
        If True, `beta` is trainable (``grad_req='write'``); if False its
        ``grad_req`` is ``'null'``. `beta` is added in `forward` either way.
    scale: bool, default True
        If True, `gamma` is trainable (``grad_req='write'``); if False its
        ``grad_req`` is ``'null'``. `gamma` is multiplied in `forward` either way.
    beta_initializer: str or `Initializer`, default 'zeros'
        Initializer for the beta weight.
    gamma_initializer: str or `Initializer`, default 'ones'
        Initializer for the gamma weight.
    **kwargs
        Forwarded to the parent class constructor.

    Inputs:
        - **data**: input tensor with arbitrary shape.

    Outputs:
        - **out**: output tensor with the same shape as `data`.

    References
    ----------
        `MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices
        <https://arxiv.org/pdf/2004.02984.pdf>`_

    Examples
    --------
    >>> # Input of shape (2, 5)
    >>> x = mx.np.array([[1, 2, 3, 4, 5], [1, 1, 2, 2, 2]])
    >>> # With the default initializers (gamma=1, beta=0) the input is unchanged
    >>> layer = NoNorm(in_channels=5)
    >>> layer.initialize(ctx=mx.cpu(0))
    >>> layer(x)
    array([[1., 2., 3., 4., 5.],
       [1., 1., 2., 2., 2.]])
    """
    def __init__(self,
                 in_channels,
                 center=True,
                 scale=True,
                 beta_initializer='zeros',
                 gamma_initializer='ones',
                 **kwargs):
        super().__init__(**kwargs)
        self._in_channels = in_channels
        self._kwargs = {'center': center, 'scale': scale}

        param_shape = (in_channels, )
        gamma_grad_req = 'write' if scale else 'null'
        beta_grad_req = 'write' if center else 'null'
        self.gamma = Parameter('gamma',
                               grad_req=gamma_grad_req,
                               shape=param_shape,
                               init=gamma_initializer)
        self.beta = Parameter('beta',
                              grad_req=beta_grad_req,
                              shape=param_shape,
                              init=beta_initializer)

    def forward(self, data):
        """Return ``data * gamma + beta``."""
        scaled = data * self.gamma.data()
        return scaled + self.beta.data()

    def __repr__(self):
        options = ', '.join('='.join([key, value.__repr__()])
                            for key, value in self._kwargs.items())
        return '{}({}, in_channels={})'.format(self.__class__.__name__,
                                               options,
                                               self.gamma.shape[0])
Esempio n. 9
0
class TransformerXLDecoder(HybridBlock):
    """Stack of ``TransformerXLDecoderLayer`` blocks sharing two query biases.

    The decoder owns two parameters, ``query_k_bias`` and ``query_r_bias``,
    each of shape ``(num_heads, units // num_heads)``. Both are passed to
    every layer on each forward call. All remaining constructor arguments are
    forwarded to each ``TransformerXLDecoderLayer``; note that
    ``layernorm_eps`` is forwarded under the keyword ``layer_norm_eps``.

    Parameters
    ----------
    num_layers
        Number of decoder layers to stack.
    units, hidden_size, num_heads, activation_dropout, dropout,
    attention_dropout, layernorm_eps, activation, dtype, layout, pre_norm,
    weight_initializer
        Forwarded to every decoder layer. ``units`` and ``num_heads`` also
        determine the shape of the two bias parameters.
    bias_initializer
        Initializer for the two bias parameters; also forwarded to every layer.
    """
    def __init__(self,
                 num_layers=3,
                 units=512,
                 hidden_size=2048,
                 num_heads=8,
                 activation_dropout=0.1,
                 dropout=0.1,
                 attention_dropout=0.0,
                 layernorm_eps=1E-12,
                 activation='relu',
                 dtype='float32',
                 layout='NT',
                 pre_norm=False,
                 weight_initializer=None,
                 bias_initializer=None):
        super().__init__()
        bias_shape = (num_heads, units // num_heads)
        self.query_k_bias = Parameter('query_k_bias',
                                      shape=bias_shape,
                                      init=bias_initializer,
                                      allow_deferred_init=True)
        self.query_r_bias = Parameter('query_r_bias',
                                      shape=bias_shape,
                                      init=bias_initializer,
                                      allow_deferred_init=True)

        # Every layer is built with the same configuration.
        layer_config = {
            'units': units,
            'hidden_size': hidden_size,
            'num_heads': num_heads,
            'activation_dropout': activation_dropout,
            'dropout': dropout,
            'attention_dropout': attention_dropout,
            'layer_norm_eps': layernorm_eps,
            'activation': activation,
            'dtype': dtype,
            'layout': layout,
            'pre_norm': pre_norm,
            'weight_initializer': weight_initializer,
            'bias_initializer': bias_initializer,
        }
        self.decoder_layers = nn.HybridSequential()
        for _ in range(num_layers):
            self.decoder_layers.add(TransformerXLDecoderLayer(**layer_config))

    def forward(self, data, mem_l, rel_positions, mask):
        """Run the input through every layer and collect each layer's output.

        Shapes below are those stated by the original author; they are not
        checked in this method.

        Parameters
        ----------
        data
            - layout = 'NT':
                Shape (batch_size, query_length)
            - layout = 'TN':
                Shape (query_length, batch_size)
            NOTE(review): the outputs carry a trailing channel axis, so the
            input probably does too - confirm.
        mem_l
            Contains a list of memory objects, one per layer (indexed by layer
            position), each one will contain:
            - layout = 'NT':
                Shape (batch_size, mem_length, C_i)
            - layout = 'TN':
                Shape (mem_length, batch_size, C_i)
        rel_positions
            The relative positions.
            Shape (query_length, mem_length + query_length)
        mask
            Mask between the query and the memory + query.
            Shape (batch_size, query_length, mem_length + query_length)

        Returns
        -------
        out_l
            Contains a list of hidden states, one per layer, each will contain:
            - layout = 'NT'
                Shape (batch_size, query_length, C_o)
            - layout = 'TN'
                Shape (query_length, batch_size, C_o)
        """
        k_bias = self.query_k_bias.data()
        r_bias = self.query_r_bias.data()
        hidden = data
        out_l = []
        for layer_idx, layer in enumerate(self.decoder_layers):
            # Each layer consumes the previous layer's output and its own memory.
            hidden = layer(hidden, mem_l[layer_idx], rel_positions, mask,
                           r_bias, k_bias)
            out_l.append(hidden)
        return out_l