Ejemplo n.º 1
0
    def forward(self, input):
        out1 = self.block1(input)

        att2 = self.attn(input)
        att2 = self.gamma.data() * att2

        out = (input + out1) * (FF.ones_like(out1) + att2)
        return out
Ejemplo n.º 2
0
 def test_getitem_autograd(np_array, index):
     x = np.array(np_array, dtype=np_array.dtype)
     x.attach_grad()
     with autograd.record():
         y = x[index]
     y.backward()
     value = np.ones_like(y)
     x_grad = np.zeros_like(x)
     x_grad[index] = value
     assert same(x_grad.asnumpy(), x.grad.asnumpy())
Ejemplo n.º 3
0
 def forward(self, lengths):
     if self.alpha == 0.0:
         if isinstance(lengths, (int, float)):
             return 1.0
         else:
             return np.ones_like(lengths)
     else:
         numerator = self.beta + lengths if self.beta != 0.0 else lengths
         numerator = numerator ** self.alpha if self.alpha != 1.0 else numerator
         return numerator / self.denominator
Ejemplo n.º 4
0
    def forward(self, input):

        # =========== UNet branch ===========
        out10 = self.conv_init_1(input)
        out1 = self.compr11(out10)
        out1 = FFx.relu(out1)
        out1 = self.compr12(out1)
        out1 = FFx.relu(out1)
        out1 = self.expand1(out1, out10)
        out1 = FFx.relu(out1)

        # =========== \capNet branch ===========

        out20 = self.conv_init_2(input)
        out2 = self.expand2(out20)
        out2 = FFx.relu(out2)
        out2 = self.compr21(out2)
        out2 = FFx.relu(out2)
        out2 = self.compr22(FF.concatenate([out2, out20], axis=1))
        out2 = FFx.relu(out2)

        att = self.gamma1.data() * self.att(input)
        ratt122 = self.gamma2.data() * self.ratt122(out1, out2, out2)
        ratt211 = self.gamma3.data() * self.ratt211(out2, out1, out1)

        ones1 = FF.ones_like(out10)
        ones2 = FF.ones_like(input)

        # Enhanced output of 1, based on memory of 2
        out122 = out1 * (ones1 + ratt122)
        # Enhanced output of 2, based on memory of 1
        out211 = out2 * (ones1 + ratt211)

        out12 = FFx.relu(self.collect(FF.concatenate([out122, out211],
                                                     axis=1)))

        # Emphasize residual output from memory on input
        out_res = (input + out12) * (ones2 + att)
        return out_res
Ejemplo n.º 5
0
    def forward(self, length_predictions, labels):
        """
        Returns MSE loss.

        :param length_predictions: Length predictions. Shape: (batch_size,).
        :param labels: Targets. Shape: (batch_size,).
        :return: MSE loss of length predictions of the batch.
        """
        # (batch_size,)
        loss = (self.weight / 2) * np.square(length_predictions - labels)
        # (1,)
        loss = np.sum(loss)
        num_samples = np.sum(np.ones_like(length_predictions))
        return loss, num_samples
Ejemplo n.º 6
0
    def forward(self, length_predictions, labels):
        """
        Returns Poisson loss and output given data and expected integers as labels.

        :param length_predictions: Length predictions. Shape: (batch_size,).
        :param labels: Targets. Shape: (batch_size,).
        :return: Poisson loss of length predictions of the batch, and number of samples (batch size).
        """
        # (batch_size,)
        loss = length_predictions - labels * np.log(np.maximum(1e-10, length_predictions))
        # (1,)
        loss = np.sum(loss * self.weight)
        num_samples = np.sum(np.ones_like(length_predictions))
        return loss, num_samples
Ejemplo n.º 7
0
    def forward(self, input):

        # =========== UNet branch ===========
        out10 = self.conv_init_1(input)
        out1 = self.compr11(out10)
        out1 = FFx.relu(out1)
        #print (out1.shape)
        out1 = self.compr12(out1)
        out1 = FFx.relu(out1)
        #print (out1.shape)
        out1 = self.expand1(out1, out10)
        out1 = FFx.relu(out1)

        # =========== \capNet branch ===========
        out20 = self.conv_init_2(input)
        out2 = self.expand2(out20)
        out2 = FFx.relu(out2)
        out2 = self.compr21(out2)
        out2 = FFx.relu(out2)
        out2 = self.compr22(out2, out20)

        att = self.gamma1.data() * self.att(input)
        ratt122 = self.gamma2.data() * self.ratt122(out1, out2, out2)
        ratt211 = self.gamma3.data() * self.ratt211(out2, out1, out1)

        ones1 = FF.ones_like(out10)
        ones2 = FF.ones_like(input)

        # Enhanced output of 1, based on memory of 2
        out122 = out1 * (ones1 + ratt122)
        # Enhanced output of 2, based on memory of 1
        out211 = out2 * (ones1 + ratt211)

        out12 = self.collect(out122, out211)  # includes relu, it's for fusion

        out_res = (input + out12) * (ones2 + att)
        return out_res
Ejemplo n.º 8
0
 def test_setitem_autograd(np_array, index):
     """
     np_array: native numpy array.
     """
     x = np.array(np_array, dtype=np_array.dtype)
     out_shape = x[index].shape
     y = np.array(_np.random.uniform(size=out_shape))
     y.attach_grad()
     try:
         with mx.autograd.record():
             x[index] = y
             x.backward()
             y_grad = np.ones_like(y)
             assert same(y_grad.asnumpy(), y.grad.asnumpy())
     except mx.base.MXNetError as err:
         assert str(err).find('Inplace operations (+=, -=, x[:]=, etc) are not supported when recording with') != -1
Ejemplo n.º 9
0
    def forward(self, input_t1, input_t2):
        # These inputs must have the same dimensionality , t1, t2
        relatt12 = self.gamma1.data() * self.relatt12(input_t1, input_t2,
                                                      input_t2)
        relatt21 = self.gamma2.data() * self.relatt21(input_t2, input_t1,
                                                      input_t1)

        ones = FF.ones_like(input_t1)

        # Enhanced output of 1, based on memory of 2
        out12 = input_t1 * (ones + relatt12)
        # Enhanced output of 2, based on memory of 1
        out21 = input_t2 * (ones + relatt21)

        fuse = self.fuse(FF.concatenate([out12, out21], axis=1))
        fuse = FFx.relu(fuse)

        return fuse
Ejemplo n.º 10
0
def gen_self_attn_mask(data,
                       valid_length=None,
                       dtype: type = np.float32,
                       attn_type: str = 'full',
                       layout: str = 'NT'):
    """Generate the mask used for the encoder, i.e, self-attention.

    In our implementation, 1 --> not masked, 0 --> masked

    Let's consider the data with two samples:

    .. code-block:: none

        data =
            [['I',   'can', 'now',   'use', 'numpy', 'in',  'Gluon@@', 'NLP'  ],
             ['May', 'the', 'force', 'be',  'with',  'you', '<PAD>',   '<PAD>']]
        valid_length =
            [8, 6]

    - attn_type = 'causal'
        Each token will attend to itself + the tokens before.
        It will not attend to tokens in the future.

        For our example, the mask of the first sample is

        .. code-block:: none

                       ['I', 'can', 'now', 'use', 'numpy', 'in', 'Gluon@@', 'NLP']
            'I':         1,    0,     0,     0,      0,     0,      0,      0
            'can':       1,    1,     0,     0,      0,     0,      0,      0
            'now':       1,    1,     1,     0,      0,     0,      0,      0
            'use':       1,    1,     1,     1,      0,     0,      0,      0
            'numpy':     1,    1,     1,     1,      1,     0,      0,      0
            'in':        1,    1,     1,     1,      1,     1,      0,      0
            'Gluon@@':   1,    1,     1,     1,      1,     1,      1,      0
            'NLP':       1,    1,     1,     1,      1,     1,      1,      1

        The mask of the second sample is

        .. code-block:: none

                       ['May', 'the', 'force', 'be', 'with', 'you', '<PAD>', '<PAD>']
            'May':        1,    0,     0,     0,      0,     0,      0,      0
            'the':        1,    1,     0,     0,      0,     0,      0,      0
            'force':      1,    1,     1,     0,      0,     0,      0,      0
            'be':         1,    1,     1,     1,      0,     0,      0,      0
            'with':       1,    1,     1,     1,      1,     0,      0,      0
            'you':        1,    1,     1,     1,      1,     1,      0,      0
            '<PAD>':      0,    0,     0,     0,      0,     0,      0,      0
            '<PAD>':      0,    0,     0,     0,      0,     0,      0,      0


    - attn_type = 'full'
        Each token will attend to both the tokens before and in the future

        For our example, the mask of the first sample is

        .. code-block:: none

                       ['I', 'can', 'now', 'use', 'numpy', 'in', 'Gluon@@', 'NLP']
            'I':         1,    1,     1,     1,      1,     1,      1,      1
            'can':       1,    1,     1,     1,      1,     1,      1,      1
            'now':       1,    1,     1,     1,      1,     1,      1,      1
            'use':       1,    1,     1,     1,      1,     1,      1,      1
            'numpy':     1,    1,     1,     1,      1,     1,      1,      1
            'in':        1,    1,     1,     1,      1,     1,      1,      1
            'Gluon@@':   1,    1,     1,     1,      1,     1,      1,      1
            'NLP':       1,    1,     1,     1,      1,     1,      1,      1

        The mask of the second sample is

        .. code-block:: none

                       ['May', 'the', 'force', 'be', 'with', 'you', '<PAD>', '<PAD>']
            'May':        1,    1,     1,     1,      1,     1,      0,      0
            'the':        1,    1,     1,     1,      1,     1,      0,      0
            'force':      1,    1,     1,     1,      1,     1,      0,      0
            'be':         1,    1,     1,     1,      1,     1,      0,      0
            'with':       1,    1,     1,     1,      1,     1,      0,      0
            'you':        1,    1,     1,     1,      1,     1,      0,      0
            '<PAD>':      0,    0,     0,     0,      0,     0,      0,      0
            '<PAD>':      0,    0,     0,     0,      0,     0,      0,      0

    Parameters
    ----------
    data
        The data.

        - layout = 'NT'
            Shape (batch_size, seq_length, C)
        - layout = 'TN'
            Shape (seq_length, batch_size, C)

    valid_length
        Shape (batch_size,)
    dtype
        Data type of the mask
    attn_type
        Can be 'full' or 'causal'
    layout
        The layout of the data

    Returns
    -------
    mask
        Shape (batch_size, seq_length, seq_length)
    """
    if layout == 'NT':
        batch_axis, time_axis = 0, 1
    elif layout == 'TN':
        batch_axis, time_axis = 1, 0
    else:
        raise NotImplementedError('Unsupported layout={}'.format(layout))
    if attn_type == 'full':
        if valid_length is not None:
            valid_length = valid_length.astype(dtype)
            steps = npx.arange_like(data, axis=time_axis)  # (seq_length,)
            mask1 = (npx.reshape(steps, (1, 1, -1))
                     < npx.reshape(valid_length, (-2, 1, 1)))
            mask2 = (npx.reshape(steps, (1, -1, 1))
                     < npx.reshape(valid_length, (-2, 1, 1)))
            mask = mask1 * mask2
        else:
            # TODO(sxjscience) optimize
            seq_len_ones = np.ones_like(npx.arange_like(data, axis=time_axis))  # (seq_length,)
            batch_ones = np.ones_like(npx.arange_like(data, axis=batch_axis))   # (batch_size,)
            mask = batch_ones.reshape((-1, 1, 1)) * seq_len_ones.reshape((1, -1, 1))\
                   * seq_len_ones.reshape((1, 1, -1))
    elif attn_type == 'causal':
        steps = npx.arange_like(data, axis=time_axis)
        # mask: (seq_length, seq_length)
        # batch_mask: (batch_size, seq_length)
        mask = (np.expand_dims(steps, axis=0) <= np.expand_dims(steps, axis=1)).astype(dtype)
        if valid_length is not None:
            valid_length = valid_length.astype(dtype)
            batch_mask = (np.expand_dims(steps, axis=0) < np.expand_dims(valid_length, axis=-1)).astype(dtype)
            mask = mask * np.expand_dims(batch_mask, axis=-1)
        else:
            batch_ones = np.ones_like(npx.arange_like(data, axis=batch_axis),
                                        dtype=dtype)  # (batch_size,)
            mask = mask * batch_ones.reshape((-1, 1, 1))
    else:
        raise NotImplementedError
    return mask.astype(np.bool)
Ejemplo n.º 11
0
def gen_mem_attn_mask(mem, mem_valid_length, data, data_valid_length=None,
                      dtype=np.float32, layout: str = 'NT'):
    """Generate the mask used for the decoder. All query slots are attended to the memory slots.

    In our implementation, 1 --> not masked, 0 --> masked

    Let's consider the data + mem with a batch of two samples:

    .. code-block:: none

        mem = [['I',   'can', 'now',   'use'],
               ['May', 'the', 'force', '<PAD>']]
        mem_valid_length =
            [4, 3]
        data =
            [['numpy', 'in',    'Gluon@@', 'NLP'  ],
             ['be',    'with',  'you',     '<PAD>']]
        data_valid_length =
            [4, 3]

    For our example, the mask of the first sample is

    .. code-block:: none

                   ['I', 'can', 'now', 'use']
        'numpy':     1,    1,     1,     1
        'in':        1,    1,     1,     1
        'Gluon@@':   1,    1,     1,     1
        'NLP':       1,    1,     1,     1

    The mask of the second sample is

    .. code-block:: none

                   ['be', 'with', 'you', '<PAD>']
        'May':        1,    1,     1,     0
        'the':        1,    1,     1,     0
        'force':      1,    1,     1,     0
        '<PAD>':      0,    0,     0,     0


    Parameters
    ----------
    mem
       - layout = 'NT'
            Shape (batch_size, mem_length, C_mem)
       - layout = 'TN'
            Shape (mem_length, batch_size, C_mem)

    mem_valid_length :
        Shape (batch_size,)
    data
        - layout = 'NT'
            Shape (batch_size, query_length, C_data)
        - layout = 'TN'
            Shape (query_length, batch_size, C_data)

    data_valid_length :
        Shape (batch_size,)
    dtype
        Data type of the mask
    layout
        Layout of the data + mem tensor

    Returns
    -------
    mask :
        Shape (batch_size, query_length, mem_length)
    """
    if layout == 'NT':
        batch_axis, time_axis = 0, 1
    elif layout == 'TN':
        batch_axis, time_axis = 1, 0
    else:
        raise NotImplementedError('Unsupported layout={}'.format(layout))
    mem_valid_length = mem_valid_length.astype(dtype)
    mem_steps = npx.arange_like(mem, axis=time_axis)  # (mem_length,)
    data_steps = npx.arange_like(data, axis=time_axis)  # (query_length,)
    mem_mask = (npx.reshape(mem_steps, (1, 1, -1))
                < npx.reshape(mem_valid_length, (-2, 1, 1))).astype(dtype)  # (B, 1, mem_length)
    if data_valid_length is not None:
        data_valid_length = data_valid_length.astype(dtype)
        data_mask = (npx.reshape(data_steps, (1, -1, 1))
                     < npx.reshape(data_valid_length, (-2, 1, 1))).astype(dtype)  # (B, query_length, 1)
        mask = mem_mask * data_mask
    else:
        query_length_ones = np.ones_like(data_steps)
        mask = query_length_ones.reshape((1, -1, 1)) * mem_mask
    return mask.astype(np.bool)
Ejemplo n.º 12
0
    def get_corrupted_tokens(self, inputs, original_tokens, masked_positions, logits):
        """
        Sample from the generator to create corrupted input.

        Parameters
        ----------
        F
        inputs
            The masked input
            - layout = 'NT'
                Shape (batch_size, seq_length)
            - layout = 'TN'
                Shape (seq_length, batch_size)
        original_tokens
            The original tokens that appear in the unmasked input sequence
            Shape (batch_size, num_masked_positions).
        masked_positions
            The masked position of the sequence
            Shape (batch_size, num_masked_positions).
        logits
            The logits of each tokens
            Shape (batch_size, num_masked_positions, vocab_size)

        Returns
        -------
        corrupted_tokens
            Shape (batch_size, )
        fake_data
            - layout = 'NT'
                Shape (batch_size, seq_length)
            - layout = 'TN'
                Shape (seq_length, batch_size)
        labels
            - layout = 'NT'
                Shape (batch_size, seq_length)
            - layout = 'TN'
                Shape (seq_length, batch_size)
        """

        if self._disallow_correct:
            # TODO(sxjscience), Revise the implementation
            disallow = npx.one_hot(masked_positions, depth=self.vocab_size, dtype=self._dtype)
            logits = logits - 1000.0 * disallow
        # gumbel_softmax() samples from the logits with a noise of Gumbel distribution
        prob = gumbel_softmax(
            F,
            logits,
            temperature=self._temperature,
            eps=self._gumbel_eps,
            use_np_gumbel=False)
        corrupted_tokens = np.argmax(prob, axis=-1).astype(np.int32)

        if self.disc_backbone.layout == 'TN':
            inputs = inputs.T
        original_data = update_vectors_by_position(F,
            inputs, original_tokens, masked_positions)
        fake_data = update_vectors_by_position(F,
            inputs, corrupted_tokens, masked_positions)
        updates_mask = add_vectors_by_position(np.zeros_like(inputs),
                np.ones_like(masked_positions), masked_positions)
        # Dealing with multiple zeros in masked_positions which
        # results in a non-zero value in the first index [CLS]
        updates_mask = np.minimum(updates_mask, 1)
        labels = updates_mask * np.not_equal(fake_data, original_data)
        if self.disc_backbone.layout == 'TN':
            return corrupted_tokens, fake_data.T, labels.T
        else:
            return corrupted_tokens, fake_data, labels
 def forward(self, pred, label, valid_len):
     # weights shape: (batch_size, seq_len, 1)
     weights = np.expand_dims(np.ones_like(label), axis=-1)
     weights = npx.sequence_mask(weights, valid_len, True, axis=1)
     return super(MaskedSoftmaxCELoss, self).forward(pred, label, weights)
Ejemplo n.º 14
0
    def dynamic_masking(self, input_ids, valid_lengths):
        # TODO(zheyuye), two additional flag `disallow_from_mask` and `already_masked`
        # that control the masking status for each positions in the sequence.
        """
        Generate masking positions on-the-fly instead of during preprocessing
        Parameters
        ----------
        input_ids
            The batchified input_ids with shape (batch_size, max_seq_length)
        valid_lengths
            The batchified valid_lengths with shape (batch_size, )
        Returns
        ------
        masked_input_ids
            The masked input sequence with 15% tokens are masked with [MASK]
            shape (batch_size, max_seq_length)
        length_masks
            The masking matrix for the whole sequence that indicates the positions
            are greater than valid_length.

            shape (batch_size, max_seq_length)
        unmasked_tokens
            The original tokens that appear in the unmasked input sequence
            shape (batch_size, num_masked_positions)
        masked_positions
            The masking positions in mx.np.ndarray with shape (batch_size, num_masked_positions)
            shape (batch_size, num_masked_positions)
        masked_lm_weights
            The weight matrix containing 0 or 1 to mark the actual effect of masked positions
            shape (batch_size, num_masked_positions)
        """
        N = self._max_num_masked_position
        # Only valid token without special token are allowed to mask
        valid_candidates = np.ones_like(input_ids, dtype=np.bool)
        ignore_tokens = [
            self.vocab.cls_id, self.vocab.sep_id, self.vocab.pad_id
        ]

        for ignore_token in ignore_tokens:
            # TODO(zheyuye), Update when operation += supported
            valid_candidates = valid_candidates * \
                np.not_equal(input_ids, ignore_token)
        valid_lengths = valid_lengths.astype(np.float32)
        valid_candidates = valid_candidates.astype(np.float32)
        num_masked_position = mxnp.maximum(
            1, np.minimum(N, round(valid_lengths * self._mask_prob)))

        # Get the masking probability of each position
        sample_probs = self._proposal_distribution * valid_candidates
        sample_probs /= mxnp.sum(sample_probs, axis=-1, keepdims=True)
        sample_probs = npx.stop_gradient(sample_probs)
        gumbels = mxnp.random.gumbel(np.zeros_like(sample_probs))
        # Following the instruction of official repo to avoid deduplicate postions
        # with Top_k Sampling as https://github.com/google-research/electra/issues/41
        masked_positions = npx.topk(mxnp.log(sample_probs) + gumbels,
                                    k=N,
                                    axis=-1,
                                    ret_typ='indices',
                                    dtype=np.int32)

        masked_weights = npx.sequence_mask(mxnp.ones_like(masked_positions),
                                           sequence_length=num_masked_position,
                                           use_sequence_length=True,
                                           axis=1,
                                           value=0)
        masked_positions = masked_positions * masked_weights
        length_masks = npx.sequence_mask(mxnp.ones_like(input_ids,
                                                        dtype=np.float32),
                                         sequence_length=valid_lengths,
                                         use_sequence_length=True,
                                         axis=1,
                                         value=0)
        unmasked_tokens = select_vectors_by_position(
            input_ids, masked_positions) * masked_weights
        masked_weights = masked_weights.astype(np.float32)
        replaced_positions = (mxnp.random.uniform(
            mxnp.zeros_like(masked_positions), mxnp.ones_like(
                masked_positions)) < self._replace_prob) * masked_positions
        # dealing with multiple zero values in replaced_positions which causes
        # the [CLS] being replaced
        filled = mxnp.where(replaced_positions, self.vocab.mask_id,
                            self.vocab.cls_id).astype(np.int32)
        # Masking token by replacing with [MASK]
        masked_input_ids = update_vectors_by_position(input_ids, filled,
                                                      replaced_positions)

        # Note: It is likely have multiple zero values in masked_positions if number of masked of
        # positions not reached the maximum. However, this example hardly exists since valid_length
        # is almost always equal to max_seq_length
        masked_input = self.MaskedInput(input_ids=masked_input_ids,
                                        masks=length_masks,
                                        unmasked_tokens=unmasked_tokens,
                                        masked_positions=masked_positions,
                                        masked_weights=masked_weights)
        return masked_input
Ejemplo n.º 15
0
def test_ones_like():
    inp = np.ones((2, INT_OVERFLOW))
    out = np.ones_like(inp)
    assert out.shape == inp.shape
    assert out[0, 0] == 1 and out[-1, -1] == 1