Example #1
    def forward(self, scores, target_dists, finished, best_hyp_indices):
        """
        Choose an extension of each hypothesis from its softmax distribution.

        :param scores: Vocabulary scores for the next beam step. (batch_size * beam_size, target_vocabulary_size)
        :param target_dists: The non-cumulative target distributions (ignored).
        :param finished: The list of finished hypotheses.
        :param best_hyp_indices: Best hypothesis indices constant.
        :return: The row indices, column indices, and values of the sampled words.
        """
        # Map the negative logprobs to probabilities so as to have a distribution
        target_dists = np.exp(-target_dists)

        # n == 0 means sample from the full vocabulary. Otherwise, we sample from the top n.
        if self.n != 0:
            # select the top n in each row, via a mask
            masked_items = npx.topk(target_dists, k=self.n, ret_typ='mask', axis=1, is_ascend=False)
            # set unmasked items to 0
            masked_items = np.where(masked_items, target_dists, masked_items)
            # renormalize
            target_dists = masked_items / np.sum(masked_items, axis=1, keepdims=True)

        # Sample from the target distributions over words, then get the corresponding values from the cumulative scores
        best_word_indices = npx.random.categorical(target_dists, get_prob=False)
        # Zeroes for finished hypotheses.
        best_word_indices = np.where(finished, np.zeros_like(best_word_indices), best_word_indices)
        values = npx.pick(scores, best_word_indices, axis=1, keepdims=True)

        best_hyp_indices = npx.slice_like(best_hyp_indices, best_word_indices, axes=(0,))

        return best_hyp_indices, best_word_indices, values
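To make the top-n renormalization concrete, here is a minimal NumPy-only sketch of the same step (toy values; plain numpy instead of mxnet.np, and n=2 is assumed):

import numpy as np

scores = np.array([[2.3, 0.1, 4.0, 1.5]])    # negative logprobs, shape (1, vocab)
dists = np.exp(-scores)                      # map back to probabilities
n = 2                                        # sample only from the top n
top = np.argsort(-dists, axis=1)[:, :n]      # indices of the n largest probabilities
mask = np.zeros_like(dists, dtype=bool)
np.put_along_axis(mask, top, True, axis=1)
masked = np.where(mask, dists, 0.0)          # zero out everything below the top n
dists = masked / masked.sum(axis=1, keepdims=True)   # renormalize
word = np.array([np.random.choice(d.size, p=d) for d in dists])
print(word)                                  # one sampled word index per row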
Example #2
    def forward(self, target_dists, finished, inactive,
                      scores_accumulated, lengths, max_lengths,
                      unk_dist, pad_dist, eos_dist):
        # make sure to avoid generating <unk> if unk_dist is specified
        if unk_dist is not None:
            target_dists = target_dists + unk_dist
        # broadcast hypothesis score to each prediction.
        # scores_accumulated. Shape: (batch*beam, 1)
        # target_dists. Shape: (batch*beam, vocab_size)
        scores = target_dists + scores_accumulated

        # Special treatment for finished and inactive rows. Inactive rows are inf everywhere;
        # finished rows are inf everywhere except column zero (pad_id), which holds the accumulated model score.
        # Items that are finished (but not inactive) get their previous accumulated score for the <pad> symbol,
        # infinity otherwise.
        # pad_dist. Shape: (batch*beam, vocab_size)
        pad_dist = np.concatenate((scores_accumulated, pad_dist), axis=1)
        scores = np.where(np.logical_or(finished, inactive), pad_dist, scores)

        # Update lengths of all items, except those that were already finished. This updates
        # the lengths for inactive items, too, but that doesn't matter since they are ignored anyway.
        lengths = lengths + (1 - finished)

        # Items that are at their maximum length and not finished now are forced to produce the <eos> symbol.
        # That is, we keep scores for hypotheses below max length or finished, and 'force-eos' the rest.
        below_max_length = lengths < max_lengths
        scores = np.where(np.logical_or(below_max_length, finished), scores, eos_dist + scores)

        return scores, lengths
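A small NumPy-only sketch of the finished-row treatment (toy values, not the Sockeye API): after the concatenation, column zero of pad_dist carries the accumulated score and every other column is inf, so np.where routes finished (or inactive) rows to it.

import numpy as np

vocab_size = 4
scores_accumulated = np.array([[1.5], [2.0]])      # shape (batch*beam, 1)
pad_dist = np.full((2, vocab_size - 1), np.inf)    # inf for every non-<pad> word
finished = np.array([[1], [0]])                    # row 0 is finished

pad_dist = np.concatenate((scores_accumulated, pad_dist), axis=1)
scores = np.zeros((2, vocab_size))                 # stand-in for real scores
scores = np.where(finished.astype(bool), pad_dist, scores)
print(scores)   # row 0 becomes [1.5, inf, inf, inf]; row 1 is untouched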
Example #3
def plot_loss_wrong_preds(history, x=None, y=None, yhat=None, labels=None):
    # Plot the training loss and validation loss
    plt.plot(history[0], label='train_err')
    plt.plot(history[1], label='val_err')
    plt.title('Error trend')
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

    # Show some correctly and incorrectly classified samples
    if labels:
        # get some wrong predictions
        mis_idx = np.where(y != yhat)[0]
        # get some correct predictions
        idx = np.where(y == yhat)[0]
        # sample without replacement so the same image is not shown twice
        wrong_preds = np.random.choice(mis_idx, size=min(len(mis_idx), 8), replace=False)
        correct_preds = np.random.choice(idx, size=min(len(idx), 8), replace=False)
        ax = []
        fig = plt.figure(figsize=(12, 12))
        columns = 4
        rows = 4

        for i, j in enumerate(correct_preds):
            j = j.item()
            if isinstance(x, np.ndarray):
                img = x[j]
            else:
                img = plt.imread(x[j])
            ax.append(fig.add_subplot(rows, columns, i + 1))
            ax[-1].set_title(
                f'true: {labels[y[j].item()]}, pred: {labels[yhat[j].item()]}',
                color='g')
            plt.imshow(img)
        for i, j in enumerate(wrong_preds):
            j = j.item()
            if isinstance(x, np.ndarray):
                img = x[j]
            else:
                img = plt.imread(x[j])
            ax.append(fig.add_subplot(rows, columns, i + 9))
            ax[-1].set_title(
                f'true: {labels[y[j].item()]}, pred: {labels[yhat[j].item()]}',
                color='r')
            plt.imshow(img)
        plt.tight_layout(pad=1.2)
        fig.suptitle('Sample Correct(8) & Wrong predictions(8)', y=0.001)
        plt.show()
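A possible call, assuming the function is in scope together with matplotlib and numpy; all inputs below are hypothetical stand-ins (random 8x8 "images" and random binary labels):

import numpy as np

history = ([0.9, 0.5, 0.3], [1.0, 0.6, 0.5])   # (train losses, val losses) per epoch
x = np.random.rand(20, 8, 8)
y = np.random.randint(0, 2, size=20)
yhat = np.random.randint(0, 2, size=20)
plot_loss_wrong_preds(history, x=x, y=y, yhat=yhat, labels=['cat', 'dog'])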
Example #4
def masked_logsoftmax(att_score, mask, axis: int = -1):
    """Ignore the masked elements when calculating the softmax. The mask can be broadcastable.

    Parameters
    ----------
    att_score : Symbol or NDArray
        Shape (..., length, ...)
    mask : Symbol or NDArray or None
        Shape (..., length, ...)
        mask = 1 --> not masked
        mask = 0 --> masked
    axis
        The axis to calculate the softmax. att_score.shape[axis] must be the same as mask.shape[axis]

    Returns
    -------
    logits : Symbol or NDArray
        Shape (..., length, ...)
        The masked values will be set to -inf
    """
    if mask is None:
        return npx.log_softmax(att_score, axis=axis)
    else:
        mask = mask.astype(np.bool)
        return np.where(mask, npx.masked_log_softmax(att_score, mask, axis=axis), -np.inf)
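The intended semantics can be checked against a small NumPy-only reference (a sketch, not the MXNet operator): log-softmax is computed over the unmasked entries only, and masked positions come out as -inf.

import numpy as np

def ref_masked_logsoftmax(att_score, mask):
    # Push masked positions to a very negative score, then log-softmax as usual.
    x = np.where(mask, att_score, -1e18)
    x = x - x.max(axis=-1, keepdims=True)
    logits = x - np.log(np.exp(x).sum(axis=-1, keepdims=True))
    return np.where(mask, logits, -np.inf)

score = np.array([1.0, 2.0, 3.0])
mask = np.array([True, True, False])
print(ref_masked_logsoftmax(score, mask))   # log-softmax over the first two entries; -inf for the third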
Example #5
    def forward(self, logits, labels, length_ratio, source_length,
                target_length):
        """
        :param logits: Model logits. Shape: (batch, length, vocab_size).
        :param labels: Gold targets. Shape: (batch, length).
        :param length_ratio: Length Ratios. Shape: (batch,).
        :param source_length: Source lengths. Shape: (batch,).
        :param target_length: Target lengths. Shape: (batch,).
        :return: Sequence scores. Shape: (batch,).
        """
        logprobs = npx.log_softmax(logits,
                                   axis=-1,
                                   temperature=self.softmax_temperature)

        # Select the log probability of each label token from the log-softmax output.
        # token_scores: (batch_size, target_seq_len)
        token_scores = npx.pick(logprobs, labels, axis=-1)
        if self.score_type == C.SCORING_TYPE_NEGLOGPROB:
            token_scores = token_scores * -1

        # Sum, then apply length penalty. The call to `np.where` masks out invalid values from scores.
        # zeros and sums: (batch_size,)
        scores = np.sum(np.where(labels != 0, token_scores,
                                 np.zeros_like(token_scores)),
                        axis=1)

        if self.constant_length_ratio is not None and self.constant_length_ratio > 0.0:
            predicted_output_length = source_length * self.constant_length_ratio
        else:
            predicted_output_length = source_length * length_ratio

        scores = self.scorer(scores, target_length, predicted_output_length)

        return scores
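The masking-and-sum step in isolation, as a NumPy-only sketch (toy values; label id 0 is <pad>): pad positions contribute nothing to the sequence score.

import numpy as np

labels = np.array([[5, 7, 0, 0]])                    # 0 = <pad>
token_scores = np.array([[-0.1, -0.2, -9.0, -9.0]])  # per-token log probabilities
seq_score = np.sum(np.where(labels != 0, token_scores,
                            np.zeros_like(token_scores)), axis=1)
print(seq_score)   # [-0.3]: the two pad positions are masked out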
Example #6
    def forward(self, best_hyp_indices, best_word_indices,
                finished, scores_accumulated, lengths, reference_lengths,
                factors=None):

        # Reorder fixed-size beam data according to best_hyp_indices (ascending)
        finished = np.take(finished, best_hyp_indices, axis=0)
        lengths = np.take(lengths, best_hyp_indices, axis=0)
        reference_lengths = np.take(reference_lengths, best_hyp_indices, axis=0)

        # Normalize hypotheses that JUST finished
        all_finished = np.expand_dims(np.logical_or(best_word_indices == self.pad_id,
                                                    best_word_indices == self.eos_id),
                                      axis=1)
        newly_finished = np.logical_xor(all_finished, finished)

        scores_accumulated = np.where(newly_finished,
                                      self._scorer(scores_accumulated,
                                                   npx.cast(lengths, self.dtype),
                                                   reference_lengths),
                                      scores_accumulated)

        # Recompute finished. Hypotheses are finished if they are extended with <pad> or <eos>
        finished = np.logical_or(best_word_indices == self.pad_id, best_word_indices == self.eos_id)
        finished = npx.cast(np.expand_dims(finished, axis=1), 'int32')

        # Concatenate sorted secondary target factors to best_word_indices. Shape: (batch*beam, num_factors)
        best_word_indices = np.expand_dims(best_word_indices, axis=1)

        if factors is not None:
            secondary_factors = np.take(factors, best_hyp_indices, axis=0)
            best_word_indices = np.concatenate((best_word_indices, secondary_factors), axis=1)

        return best_word_indices, finished, scores_accumulated, lengths, reference_lengths
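A toy NumPy illustration of the newly_finished logic (hypothetical ids pad=0, eos=3): the xor flags rows that finish on this step while leaving already-finished rows alone.

import numpy as np

pad_id, eos_id = 0, 3                      # hypothetical vocabulary ids
best_word_indices = np.array([3, 4, 0])    # row 0 emits <eos>, row 2 emits <pad>
finished = np.array([[0], [0], [1]])       # row 2 was already finished

all_finished = np.expand_dims((best_word_indices == pad_id) |
                              (best_word_indices == eos_id), axis=1)
newly_finished = np.logical_xor(all_finished, finished)
print(newly_finished.ravel())   # [ True False False]: only row 0 just finished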
Example #7
def print_model_metrics(y,
                        yhat,
                        labels,
                        title,
                        stream=sys.stdout,
                        wrong_preds=False):
    # If y is one-hot encoded, reduce it (and yhat) to class indices
    if len(y.shape) != 1:
        y = y.argmax(axis=1)
        yhat = yhat.argmax(axis=1)

    print('\n' + title + '\n------------\n', file=stream)

    print("Classification Metrics\n----------\n", file=stream)
    print(met.classification_report(y,
                                    yhat,
                                    target_names=labels,
                                    zero_division=1),
          file=stream)

    print("Confusion Matrix\n----------\n", file=stream)
    print(met.confusion_matrix(y, yhat), file=stream)

    if wrong_preds:
        print("Wrong Predictions\n----------\n", file=stream)
        mis_idx = np.where(y != yhat)[0]
        size = min(len(mis_idx), 10)
        # sample without replacement so the same example is not printed twice
        wrong_preds = np.random.choice(mis_idx, size=size, replace=False)
        for i in range(size):
            #print(df_test.iloc[wrong_preds[i]], file=stream)
            print('Original Label : {0}'.format(y[wrong_preds[i]]),
                  file=stream)
            print('Predicted Label : {0}'.format(yhat[wrong_preds[i]]),
                  file=stream)
            print('********************', file=stream)
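A possible call, assuming met is sklearn.metrics (imported elsewhere in the module) and numpy arrays as inputs; the values below are toy data:

import sys
import numpy as np

y = np.array([0, 1, 1, 0, 1])
yhat = np.array([0, 1, 0, 0, 1])
print_model_metrics(y, yhat, labels=['neg', 'pos'], title='Toy run',
                    stream=sys.stdout, wrong_preds=True)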
Example #8
def relative_position_bucket(relative_position,
                             bidirectional: bool = True,
                             num_buckets: int = 32,
                             max_distance: int = 128):
    """Map the relative position to buckets. The implementation is consistent with that
    in [mesh_tensorflow](https://github.com/tensorflow/mesh/blob/c59988047e49b4d2af05603e3170724cdbadc467/mesh_tensorflow/transformer/transformer_layers.py#L595-L637)
    where relative position is defined as `mem_i - query_j`. Thus, a positive value indicates 
    that the memory slot is in a later timestamp than the query slot. 

    After handling the bidirectional case (see below), the implementation uses the first half
    of the buckets to store exact differences and the second half to store the differences
    after a logarithmic transformation.

    Parameters
    ----------
    relative_position
        Shape (...,)
    bidirectional
        Whether we are dealing with bidirectional attention.
        If it's bidirectional, positive shifts are mapped to [num_buckets // 2, num_buckets),
        and negative shifts are mapped to [0, num_buckets // 2).
    num_buckets
        The number of buckets.
    max_distance
        Maximum distance. Positions that fall outside of `max_distance` are all clipped to the last bucket.

    Returns
    -------
    buckets
        Shape (...,).
        It has the same shape as the `relative_position`. It will have int32 type.
    """
    ret = 0
    relative_position = -relative_position
    if bidirectional:
        assert num_buckets % 2 == 0, 'When bidirectional is True, the number of buckets must be ' \
                                     'divisible by 2.'
        num_buckets //= 2
        ret = ret + (relative_position < 0).astype(np.int32) * num_buckets
        relative_position = np.abs(relative_position)
    else:
        # Clip all the negative values to 0
        relative_position = np.clip(relative_position, a_min=0, a_max=None)
    # Now, the relative_position is in the range [0, inf)

    # Half of the buckets deal with the exact increments,
    # i.e., 0, 1, 2, ..., max_exact - 1, where max_exact = num_buckets // 2
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact

    # The other half of the buckets are for logarithmically bigger bins in positions up to
    # max_distance
    val_if_large = max_exact + (
        np.log(relative_position.astype(np.float32) / max_exact) /
        math.log(max_distance / max_exact) *
        (num_buckets - max_exact)).astype(np.int32)
    val_if_large = np.minimum(val_if_large, num_buckets - 1)
    ret = ret + np.where(is_small, relative_position, val_if_large)
    return ret
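Since the body only uses np.clip, np.abs, np.log, np.minimum, and np.where, the function also runs on plain NumPy, which makes the bucketing easy to inspect (toy settings; np.log emits a harmless RuntimeWarning for the zero shift, and the is_small branch masks that value anyway):

import numpy as np

rel = np.arange(-10, 11)   # mem_i - query_j
print(relative_position_bucket(rel, bidirectional=True,
                               num_buckets=8, max_distance=16))
# small |shifts| get exact buckets; larger ones share logarithmically spaced buckets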
Example #9
def masked_logsoftmax(att_score, mask, dtype=np.float32, axis: int = -1):
    """Ignore the masked elements when calculating the softmax. The mask can be broadcastable.

    Parameters
    ----------
    att_score : Symbol or NDArray
        Shape (..., length, ...)
    mask : Symbol or NDArray or None
        Shape (..., length, ...)
        mask = 1 --> not masked
        mask = 0 --> masked
    dtype
        data type
    axis
        The axis to calculate the softmax. att_score.shape[axis] must be the same as mask.shape[axis]

    Returns
    -------
    logits : Symbol or NDArray
        Shape (..., length, ...)
        The masked values will be set to -inf
    """
    if mask is not None:
        # Fill in the masked scores with a very small value
        neg = -1e18
        if _np.dtype(dtype) == np.float16:
            neg = -1e4
        else:
            try:
                # if AMP (automatic mixed precision) is enabled, -1e18 will cause NaN.
                from mxnet.contrib import amp
                if amp.amp._amp_initialized:
                    neg = -1e4
            except ImportError:
                pass
        att_score = np.where(mask, att_score, neg)
        logits = np.where(mask, npx.log_softmax(att_score, axis=axis), -np.inf)
    else:
        logits = npx.log_softmax(att_score, axis=axis)
    return logits
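Why the fill value depends on the dtype: -1e18 does not fit in float16 (whose max is about 65504), so under AMP it would overflow to -inf and can propagate NaNs through the softmax; -1e4 is safely representable. A quick NumPy check:

import numpy as np

print(np.float16(-1e18))   # -inf: overflows the float16 range
print(np.float16(-1e4))    # -10000.0: exactly representable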
Example #10
def test_samplek_func(batch_size, beam_size, target_vocab_size, top_n):
    pytest.importorskip("mxnet")
    from mxnet import np
    import sockeye.beam_search

    # arrange scores as increasing values from left to right, so the best item is always index 0, the next-best index 1, and so on
    scores = np.array([
        list(range(1, target_vocab_size + 1))
        for _ in range(batch_size * beam_size)
    ])
    # normalize
    target_dists = scores / scores.sum(axis=1, keepdims=True)

    samplek = sockeye.beam_search.SampleK(n=top_n)
    samplek.initialize()

    sample_best_hyp_indices = np.arange(0,
                                        batch_size * beam_size,
                                        dtype='int32')

    # 0..(batch_size * beam_size)-1
    expected_hyps = np.array(range(batch_size * beam_size), dtype='int32')
    finished = (np.random.uniform(0, 1, (batch_size * beam_size)) >
                0.5).astype('int32')

    for i in [1, 2]:
        if i == 2:
            samplek.hybridize()

        hyps, words, values = samplek(scores, scores, finished,
                                      sample_best_hyp_indices)
        assert hyps.shape[0] == batch_size * beam_size

        # The indices should always be the integers from 0 to batch*beam-1
        assert sum(hyps == expected_hyps).item() == (batch_size * beam_size)
        if top_n != 0:
            # Scores are increasing left-to-right, so best items are all the lowest word IDs.
            # No word id greater than or equal to the cap (top_n) should be selected
            assert np.sum(words >= top_n).item() == 0

        # word index should be zero for all finished hypotheses
        assert np.sum(np.where(finished, words, finished)).item() == 0
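The test receives its sizes as arguments, so a typical driver is pytest parametrization; the concrete values below are illustrative, not necessarily Sockeye's own:

import pytest

@pytest.mark.parametrize("batch_size, beam_size, target_vocab_size, top_n",
                         [(1, 5, 200, 0), (2, 2, 200, 5), (3, 1, 257, 2)])
def test_samplek_func(batch_size, beam_size, target_vocab_size, top_n):
    ...   # body as above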
Example #11
    def forward(self, inp):  # pylint: disable=arguments-differ
        """

        Parameters
        ----------
        inp
            Shape (...,)

        Returns
        -------
        out
            Shape (..., units)
        """
        if self._div_val == 1.0:
            emb = np.take(getattr(self, 'embed0_weight').data(), inp, axis=0)
            if self._units != self._embed_size:
                emb = np.dot(emb, getattr(self, 'inter_proj0_weight').data())
        else:
            emb = None
            for i, (l_idx, r_idx) in enumerate(
                    zip([0] + self._cutoffs,
                        self._cutoffs + [self._vocab_size])):
                emb_i = np.take(getattr(self,
                                        'embed{}_weight'.format(i)).data(),
                                inp - l_idx,
                                axis=0,
                                mode='clip')
                emb_i = np.dot(
                    emb_i,
                    getattr(self, 'inter_proj{}_weight'.format(i)).data())
                if emb is None:
                    emb = emb_i
                else:
                    emb = np.where(
                        np.expand_dims((inp >= l_idx) * (inp < r_idx),
                                       axis=-1), emb_i, emb)
        if self._scaled:
            emb = emb * self._emb_scale
        return emb
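The zip over [0] + cutoffs and cutoffs + [vocab_size] walks the adaptive-embedding clusters in order; a quick sketch with hypothetical cutoffs shows the (l_idx, r_idx) ranges it produces:

cutoffs, vocab_size = [2000, 10000], 30000   # hypothetical cluster boundaries
for i, (l_idx, r_idx) in enumerate(zip([0] + cutoffs, cutoffs + [vocab_size])):
    print(i, l_idx, r_idx)
# 0 0 2000       -> head cluster, largest embedding size
# 1 2000 10000   -> first tail cluster
# 2 10000 30000  -> last tail cluster, smallest embedding size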
Example #12
    def dynamic_masking(self, input_ids, valid_lengths):
        # TODO(zheyuye), two additional flags `disallow_from_mask` and `already_masked`
        # that control the masking status of each position in the sequence.
        """
        Generate masking positions on-the-fly instead of during preprocessing
        Parameters
        ----------
        input_ids
            The batchified input_ids with shape (batch_size, max_seq_length)
        valid_lengths
            The batchified valid_lengths with shape (batch_size, )
        Returns
        -------
        masked_input_ids
            The masked input sequence, in which 15% of the tokens are replaced with [MASK]
            shape (batch_size, max_seq_length)
        length_masks
            The mask for the whole sequence: 1 for positions within valid_length,
            0 for positions beyond it.
            shape (batch_size, max_seq_length)
        unmasked_tokens
            The original tokens that appear in the unmasked input sequence
            shape (batch_size, num_masked_positions)
        masked_positions
            The masked positions as an mx.np.ndarray
            shape (batch_size, num_masked_positions)
        masked_lm_weights
            The weight matrix containing 0 or 1 to mark the actual effect of masked positions
            shape (batch_size, num_masked_positions)
        """
        N = self._max_num_masked_position
        # Only valid tokens that are not special tokens may be masked
        valid_candidates = np.ones_like(input_ids, dtype=np.bool)
        ignore_tokens = [
            self.vocab.cls_id, self.vocab.sep_id, self.vocab.pad_id
        ]

        for ignore_token in ignore_tokens:
            # TODO(zheyuye), Update when operation += supported
            valid_candidates = valid_candidates * \
                np.not_equal(input_ids, ignore_token)
        valid_lengths = valid_lengths.astype(np.float32)
        valid_candidates = valid_candidates.astype(np.float32)
        num_masked_position = mxnp.maximum(
            1, np.minimum(N, round(valid_lengths * self._mask_prob)))

        # Get the masking probability of each position
        sample_probs = self._proposal_distribution * valid_candidates
        sample_probs /= mxnp.sum(sample_probs, axis=-1, keepdims=True)
        sample_probs = npx.stop_gradient(sample_probs)
        gumbels = mxnp.random.gumbel(np.zeros_like(sample_probs))
        # Following the official repo's instructions, avoid duplicate positions by
        # using top-k sampling with Gumbel noise, see https://github.com/google-research/electra/issues/41
        masked_positions = npx.topk(mxnp.log(sample_probs) + gumbels,
                                    k=N,
                                    axis=-1,
                                    ret_typ='indices',
                                    dtype=np.int32)

        masked_weights = npx.sequence_mask(mxnp.ones_like(masked_positions),
                                           sequence_length=num_masked_position,
                                           use_sequence_length=True,
                                           axis=1,
                                           value=0)
        masked_positions = masked_positions * masked_weights
        length_masks = npx.sequence_mask(mxnp.ones_like(input_ids,
                                                        dtype=np.float32),
                                         sequence_length=valid_lengths,
                                         use_sequence_length=True,
                                         axis=1,
                                         value=0)
        unmasked_tokens = select_vectors_by_position(
            input_ids, masked_positions) * masked_weights
        masked_weights = masked_weights.astype(np.float32)
        replaced_positions = (mxnp.random.uniform(
            mxnp.zeros_like(masked_positions), mxnp.ones_like(
                masked_positions)) < self._replace_prob) * masked_positions
        # deal with multiple zero values in replaced_positions, which would cause
        # [CLS] to be replaced
        filled = mxnp.where(replaced_positions, self.vocab.mask_id,
                            self.vocab.cls_id).astype(np.int32)
        # Masking token by replacing with [MASK]
        masked_input_ids = update_vectors_by_position(input_ids, filled,
                                                      replaced_positions)

        # Note: masked_positions is likely to contain multiple zero values when the number of
        # masked positions has not reached the maximum. However, this case rarely occurs since
        # valid_length is almost always equal to max_seq_length
        masked_input = self.MaskedInput(input_ids=masked_input_ids,
                                        masks=length_masks,
                                        unmasked_tokens=unmasked_tokens,
                                        masked_positions=masked_positions,
                                        masked_weights=masked_weights)
        return masked_input
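The Gumbel-top-k trick used for masked_positions can be checked in plain NumPy (a sketch, not the ELECTRA code): adding independent Gumbel noise to the log-probabilities and taking the top k indices draws k distinct positions, i.e., sampling without replacement.

import numpy as np

rng = np.random.default_rng(0)
sample_probs = np.array([0.1, 0.4, 0.2, 0.3])
k = 2
gumbels = rng.gumbel(size=sample_probs.shape)
# top-k of (log p + Gumbel noise) == k samples without replacement
masked_positions = np.argsort(-(np.log(sample_probs) + gumbels))[:k]
print(masked_positions)   # two distinct indices; higher-prob positions more likely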