def forward(self, best_hyp_indices, best_word_indices,
            finished, scores_accumulated, lengths, reference_lengths,
            factors=None):

    # Reorder fixed-size beam data according to best_hyp_indices (ascending)
    finished = np.take(finished, best_hyp_indices, axis=0)
    lengths = np.take(lengths, best_hyp_indices, axis=0)
    reference_lengths = np.take(reference_lengths, best_hyp_indices, axis=0)

    # Normalize hypotheses that JUST finished
    all_finished = np.expand_dims(np.logical_or(best_word_indices == self.pad_id,
                                                best_word_indices == self.eos_id), axis=1)
    newly_finished = np.logical_xor(all_finished, finished)

    scores_accumulated = np.where(newly_finished,
                                  self._scorer(scores_accumulated,
                                               npx.cast(lengths, self.dtype),
                                               reference_lengths),
                                  scores_accumulated)

    # Recompute finished. Hypotheses are finished if they are extended with <pad> or <eos>
    finished = np.logical_or(best_word_indices == self.pad_id,
                             best_word_indices == self.eos_id)
    finished = npx.cast(np.expand_dims(finished, axis=1), 'int32')

    # Concatenate sorted secondary target factors to best_word_indices. Shape: (batch*beam, num_factors)
    best_word_indices = np.expand_dims(best_word_indices, axis=1)

    if factors is not None:
        secondary_factors = np.take(factors, best_hyp_indices, axis=0)
        best_word_indices = np.concatenate((best_word_indices, secondary_factors), axis=1)

    return best_word_indices, finished, scores_accumulated, lengths, reference_lengths
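# Illustration only, not part of the module above: a minimal plain-NumPy sketch of the
# "newly finished" mask logic. Rows whose newly chosen word is <pad> or <eos> but that were
# not finished before are the ones whose scores get length-normalized exactly once. The ids,
# shapes and helper name below are made up for the example.
def _demo_newly_finished_mask():
    import numpy as onp  # ordinary NumPy, used only for this sketch
    pad_id, eos_id = 0, 3
    best_word_indices = onp.array([3, 5, 0, 7], dtype='int32')    # words chosen this step, (batch*beam,)
    finished = onp.array([[0], [0], [1], [0]], dtype='int32')     # flags from the previous step, (batch*beam, 1)
    all_finished = onp.expand_dims((best_word_indices == pad_id) | (best_word_indices == eos_id), axis=1)
    newly_finished = onp.logical_xor(all_finished, finished.astype(bool))
    return newly_finished  # [[True], [False], [False], [False]]: only row 0 finished just now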
def _func(*states):
    i = states[0]
    s = states[1:]
    data = np.squeeze(np.take(inputs, i), axis=0)
    out, new_s = self.cell(data, s)
    new_s = [i + 1] + new_s
    return out, new_s
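# Illustration only: the step function above threads an integer counter through the recurrent
# state so that each call consumes the next time slice of `inputs`. A plain-Python equivalent
# of the scan it is meant to drive would look roughly like this (names here are assumptions,
# not the actual control-flow operator used by the surrounding code):
#
#     states = [np.array(0)] + list(initial_cell_states)
#     outputs = []
#     for _ in range(seq_len):
#         out, states = _func(*states)
#         outputs.append(out)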
def forward(self, best_hyp_indices, *states):
    sorted_states = []
    assert len(states) == len(self.flat_structure), "Number of states do not match the defined state structure"
    for state, state_format in zip(states, self.flat_structure):
        if state_format == C.STEP_STATE:
            # Steps and source_bias have batch dimension on axis 0
            sorted_state = np.take(state, best_hyp_indices, axis=0)
        elif state_format == C.DECODER_STATE:
            # Decoder and encoder layer states have batch dimension on axis 1
            sorted_state = np.take(state, best_hyp_indices, axis=1)
        elif state_format == C.ENCODER_STATE or state_format == C.MASK_STATE:
            # No need for takes on encoder layer states
            sorted_state = state
        else:
            raise ValueError("Provided state format %s not recognized." % state_format)
        sorted_states.append(sorted_state)
    return sorted_states
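# Illustration only: why the take axis differs per state format. Step-like states are laid out
# as (batch*beam, ...) while decoder layer states are (seq_len, batch*beam, ...), so the same
# reordering by best_hyp_indices has to hit axis 0 in one case and axis 1 in the other.
# Plain-NumPy sketch with made-up shapes:
def _demo_state_reordering():
    import numpy as onp  # ordinary NumPy, used only for this sketch
    best_hyp_indices = onp.array([1, 0, 2], dtype='int32')
    step_state = onp.arange(3 * 4).reshape(3, 4)             # (batch*beam, hidden)
    decoder_state = onp.arange(2 * 3 * 4).reshape(2, 3, 4)   # (seq_len, batch*beam, hidden)
    return (onp.take(step_state, best_hyp_indices, axis=0),
            onp.take(decoder_state, best_hyp_indices, axis=1))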
def forward(self, inp):  # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    inp
        Shape (...,)

    Returns
    -------
    out
        Shape (..., units)
    """
    if self._div_val == 1.0:
        emb = np.take(getattr(self, 'embed0_weight').data(), inp, axis=0)
        if self._units != self._embed_size:
            emb = np.dot(emb, getattr(self, 'inter_proj0_weight').data())
    else:
        emb = None
        for i, (l_idx, r_idx) in enumerate(zip([0] + self._cutoffs,
                                               self._cutoffs + [self._vocab_size])):
            emb_i = np.take(getattr(self, 'embed{}_weight'.format(i)).data(),
                            inp - l_idx, axis=0, mode='clip')
            emb_i = np.dot(emb_i, getattr(self, 'inter_proj{}_weight'.format(i)).data())
            if emb is None:
                emb = emb_i
            else:
                emb = np.where(np.expand_dims((inp >= l_idx) * (inp < r_idx), axis=-1),
                               emb_i, emb)
    if self._scaled:
        emb = emb * self._emb_scale
    return emb
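# Illustration only: how the cutoffs above partition the vocabulary into clusters. With
# cutoffs [c0, c1] and vocabulary size V, cluster i covers ids in [l_idx, r_idx), where the
# (l_idx, r_idx) pairs come from zip([0] + cutoffs, cutoffs + [V]); np.where then keeps each
# token's embedding only from the cluster that actually contains its id. Numbers are made up:
def _demo_cutoff_ranges(cutoffs=(2000, 10000), vocab_size=30000):
    # -> [(0, 2000), (2000, 10000), (10000, 30000)]
    return list(zip([0] + list(cutoffs), list(cutoffs) + [vocab_size]))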
def forward(self, scores: np.ndarray,
            vocab_slice_ids: Optional[np.ndarray] = None,
            target_factors: Optional[np.ndarray] = None) -> np.ndarray:
    # shape: (batch*beam=1, 1)
    # argmin has trouble with fp16 inputs on GPUs, using top1 instead
    best_word_index = npx.topk(scores, axis=-1, k=1, ret_typ='indices', is_ascend=True, dtype='int32')
    # Map from restricted to full vocab ids if needed
    if vocab_slice_ids is not None:
        best_word_index = np.take(vocab_slice_ids, best_word_index, axis=0)

    if target_factors is not None:
        best_word_index = np.concatenate((best_word_index, target_factors), axis=1)

    return best_word_index
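# Illustration only: the ascending top-1 above is equivalent to an argmin over each scores row;
# the extra np.take then maps the position within the restricted vocabulary back to a
# full-vocabulary id. Plain-NumPy sketch with made-up scores and slice ids:
def _demo_greedy_top1():
    import numpy as onp  # ordinary NumPy, used only for this sketch
    scores = onp.array([[2.5, 0.3, 1.7]], dtype='float32')       # (batch*beam=1, restricted_vocab)
    vocab_slice_ids = onp.array([0, 17, 42], dtype='int32')      # restricted -> full vocab mapping
    best = onp.argmin(scores, axis=-1).astype('int32')[:, None]  # [[1]]
    return onp.take(vocab_slice_ids, best, axis=0)               # [[17]]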
def forward(self, relative_positions):
    buckets = relative_position_bucket(relative_positions,
                                       bidirectional=self._bidirectional,
                                       num_buckets=self._num_buckets,
                                       max_distance=self._max_distance)
    return np.take(self.weight.data(), buckets, axis=0)
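# Illustration only: the layer above expects a matrix of relative offsets. One common (assumed)
# way to build it for self-attention over a length-L sequence is key_position - query_position;
# the exact caller-side construction depends on the surrounding model code.
def _demo_relative_positions(seq_length=4):
    import numpy as onp  # ordinary NumPy, used only for this sketch
    positions = onp.arange(seq_length)
    # (L, L) matrix of offsets, which the layer buckets and uses to index its weight table
    return positions[None, :] - positions[:, None]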
def forward(self, positions):
    return np.take(self.weight.data(), positions, axis=0, mode=self._mode)
def forward(self,
            source: np.ndarray,
            source_length: np.ndarray,
            restrict_lexicon: Optional[lexicon.TopKLexicon],
            raw_constraint_list: List[Optional[constrained.RawConstraintList]],
            raw_avoid_list: List[Optional[constrained.RawConstraintList]],
            max_output_lengths: np.ndarray) -> Tuple[np.ndarray,
                                                     np.ndarray,
                                                     np.ndarray,
                                                     np.ndarray,
                                                     List[Optional[np.ndarray]],
                                                     List[Optional[constrained.ConstrainedHypothesis]]]:
    """
    Translates multiple sentences using beam search.

    :param source: Source ids. Shape: (batch_size, bucket_key, num_factors).
    :param source_length: Valid source lengths. Shape: (batch_size,).
    :param restrict_lexicon: Lexicon to use for vocabulary restriction.
    :param raw_constraint_list: A list of optional lists containing phrases (as lists of target word IDs)
           that must appear in each output.
    :param raw_avoid_list: A list of optional lists containing phrases (as lists of target word IDs)
           that must NOT appear in each output.
    :param max_output_lengths: ndarray of maximum output lengths per input in source.
           Shape: (batch_size,). Dtype: int32.
    :return: List of best hypotheses indices, list of best word indices,
             array of accumulated length-normalized negative log-probs, hypotheses lengths,
             predicted lengths of references (if any), constraints (if any).
    """
    batch_size = source.shape[0]
    logger.debug("beam_search batch size: %d", batch_size)

    # Maximum beam search iterations (determined by longest input with eos)
    max_iterations = max_output_lengths.max().item()
    logger.debug("max beam search iterations: %d", max_iterations)

    sample_best_hyp_indices = None
    if self._sample is not None:
        utils.check_condition(restrict_lexicon is None,
                              "Sampling is not available when working with a restricted lexicon.")
        sample_best_hyp_indices = np.arange(0, batch_size * self.beam_size, dtype='int32', ctx=self.context)

    # General data structure: batch_size * beam_size blocks in total;
    # a full beam for each sentence, followed by the next beam-block for the next sentence and so on

    # best word indices (also act as input). Shape: (batch*beam, num_target_factors)
    best_word_indices = np.full((batch_size * self.beam_size, self.num_target_factors),
                                fill_value=self.bos_id, ctx=self.context, dtype='int32')

    # offset for hypothesis indices in batch decoding
    offset = np.repeat(np.arange(0, batch_size * self.beam_size, self.beam_size,
                                 dtype='int32', ctx=self.context), self.beam_size)

    # locations of each batch item when first dimension is (batch * beam)
    batch_indices = np.arange(0, batch_size * self.beam_size, self.beam_size, dtype='int32', ctx=self.context)
    first_step_mask = np.full((batch_size * self.beam_size, 1),
                              fill_value=np.inf, ctx=self.context, dtype=self.dtype)
    first_step_mask[batch_indices] = 0.0

    # Best word and hypotheses indices across beam search steps from topk operation.
    best_hyp_indices_list = []  # type: List[np.ndarray]
    best_word_indices_list = []  # type: List[np.ndarray]

    lengths = np.zeros((batch_size * self.beam_size, 1), ctx=self.context, dtype='int32')
    finished = np.zeros((batch_size * self.beam_size, 1), ctx=self.context, dtype='int32')

    # Extending max_output_lengths to shape (batch_size * beam_size, 1)
    max_output_lengths = np.repeat(np.expand_dims(max_output_lengths, axis=1), self.beam_size, axis=0)

    # scores_accumulated: chosen smallest scores in scores (ascending).
    scores_accumulated = np.zeros((batch_size * self.beam_size, 1), ctx=self.context, dtype=self.dtype)

    output_vocab_size = self.output_vocab_size

    # If using a top-k lexicon, select param rows for logit computation that correspond to the
    # target vocab for this sentence.
    vocab_slice_ids = None  # type: Optional[np.ndarray]
    if restrict_lexicon:
        source_words = np.squeeze(np.split(source, self.num_source_factors, axis=2)[0], axis=2)
        vocab_slice_ids, output_vocab_size, raw_constraint_list = _get_vocab_slice_ids(restrict_lexicon,
                                                                                       source_words,
                                                                                       raw_constraint_list,
                                                                                       self.eos_id,
                                                                                       beam_size=self.beam_size)

    pad_dist = np.full((batch_size * self.beam_size, output_vocab_size - 1),
                       fill_value=np.inf, ctx=self.context, dtype=self.dtype)
    eos_dist = np.full((batch_size * self.beam_size, output_vocab_size),
                       fill_value=np.inf, ctx=self.context, dtype=self.dtype)
    eos_dist[:, C.EOS_ID] = 0
    unk_dist = None
    if self.prevent_unk:
        unk_dist = np.zeros_like(eos_dist)
        unk_dist[:, C.UNK_ID] = np.inf  # pylint: disable=E1137

    # Initialize the beam to track constraint sets, where target-side lexical constraints are present
    constraints = constrained.init_batch(raw_constraint_list, self.beam_size, self.bos_id, self.eos_id)

    if self.global_avoid_trie or any(raw_avoid_list):
        avoid_states = constrained.AvoidBatch(batch_size, self.beam_size,
                                              avoid_list=raw_avoid_list,
                                              global_avoid_trie=self.global_avoid_trie)
        avoid_states.consume(best_word_indices[:, 0])  # constraints operate only on primary target factor

    # (0) encode source sentence, returns a list
    model_states, estimated_reference_lengths = self._inference.encode_and_initialize(source, source_length)
    # repeat states to beam_size
    model_states = _repeat_states(model_states, self.beam_size, self._inference.state_structure())
    # repeat estimated_reference_lengths to shape (batch_size * beam_size, 1)
    estimated_reference_lengths = np.repeat(estimated_reference_lengths, self.beam_size, axis=0)

    # Records items in the beam that are inactive. At the beginning (t==1), there is only one valid or active
    # item on the beam for each sentence
    inactive = np.zeros((batch_size * self.beam_size, 1), dtype='int32', ctx=self.context)

    t = 1
    for t in range(1, max_iterations + 1):  # max_iterations + 1 required to get correct results
        # (1) obtain next predictions and advance models' state
        # target_dists: (batch_size * beam_size, target_vocab_size)
        target_dists, model_states, target_factors = self._inference.decode_step(best_word_indices,
                                                                                 model_states,
                                                                                 vocab_slice_ids)

        # (2) Produces the accumulated cost of target words in each row.
        # There is special treatment for finished and inactive rows: inactive rows are inf everywhere;
        # finished rows are inf everywhere except column zero, which holds the accumulated model score
        scores, lengths = self._update_scores(target_dists,
                                              finished,
                                              inactive,
                                              scores_accumulated,
                                              lengths,
                                              max_output_lengths,
                                              unk_dist,
                                              pad_dist,
                                              eos_dist)

        # Mark entries that should be blocked as having a score of np.inf
        if self.global_avoid_trie or any(raw_avoid_list):
            block_indices = avoid_states.avoid()
            if len(block_indices) > 0:
                scores[block_indices] = np.inf
                if self._sample is not None:
                    target_dists[block_indices] = np.inf

        # (3) Get beam_size winning hypotheses for each sentence block separately. Only look as
        # far as the active beam size for each sentence.
        if self._sample is not None:
            best_hyp_indices, best_word_indices, scores_accumulated = self._sample(scores,
                                                                                   target_dists,
                                                                                   finished,
                                                                                   sample_best_hyp_indices)
        else:
            # On the first timestep, all hypotheses have identical histories, so force topk() to choose extensions
            # of the first row only by setting all other rows to inf
            if t == 1:
                scores += first_step_mask

            best_hyp_indices, best_word_indices, scores_accumulated = self._top(scores, offset)

        # Constraints for constrained decoding are processed sentence by sentence
        if any(raw_constraint_list):
            best_hyp_indices, best_word_indices, scores_accumulated, constraints, inactive = constrained.topk(
                t,
                batch_size,
                self.beam_size,
                inactive,
                scores,
                constraints,
                best_hyp_indices,
                best_word_indices,
                scores_accumulated)

        # Map from restricted to full vocab ids if needed
        if restrict_lexicon:
            best_word_indices = np.take(vocab_slice_ids, best_word_indices, axis=0)

        # (4) Normalize the scores of newly finished hypotheses. Note that after this until the
        # next call to topk(), hypotheses may not be in sorted order.
        _sort_inputs = [best_hyp_indices, best_word_indices, finished, scores_accumulated, lengths,
                        estimated_reference_lengths]
        if target_factors is not None:
            _sort_inputs.append(target_factors)
        best_word_indices, finished, scores_accumulated, lengths, estimated_reference_lengths = \
            self._sort_norm_and_update_finished(*_sort_inputs)

        # Collect best hypotheses, best word indices
        best_word_indices_list.append(best_word_indices)
        best_hyp_indices_list.append(best_hyp_indices)

        if self._should_stop(finished, batch_size):
            break

        # (5) update models' state with winning hypotheses (ascending)
        model_states = self._sort_states(best_hyp_indices, *model_states)

    logger.debug("Finished after %d out of %d steps.", t, max_iterations)

    # (6) Sort the hypotheses within each sentence (normalization for finished hyps may have unsorted them).
    scores_accumulated_shape = scores_accumulated.shape
    folded_accumulated_scores = scores_accumulated.reshape((batch_size, -1))
    indices = np.argsort(folded_accumulated_scores.astype('float32', copy=False), axis=1).reshape((-1,))
    best_hyp_indices = np.unravel_index(indices, scores_accumulated_shape)[0].astype('int32') + offset
    scores_accumulated = scores_accumulated.take(best_hyp_indices, axis=0)
    best_hyp_indices_list.append(best_hyp_indices)
    lengths = lengths.take(best_hyp_indices, axis=0)
    all_best_hyp_indices = np.stack(best_hyp_indices_list, axis=1)
    all_best_word_indices = np.stack(best_word_indices_list, axis=2)
    constraints = [constraints[x] for x in best_hyp_indices.tolist()]

    return all_best_hyp_indices, \
           all_best_word_indices, \
           scores_accumulated, \
           lengths.astype('int32', copy=False), \
           estimated_reference_lengths, \
           constraints
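# Illustration only, a simplified sketch of how the returned lattice can be unrolled; the
# library's own result assembly lives elsewhere and additionally handles padding, EOS trimming
# and secondary factors. all_best_hyp_indices has shape (batch*beam, num_steps + 1): one column
# per decode step plus the final within-sentence sort; all_best_word_indices has shape
# (batch*beam, num_factors, num_steps). Helper name and argument layout are assumptions.
def _demo_backtrack_primary_factor(all_best_hyp_indices, all_best_word_indices, final_row):
    num_steps = all_best_word_indices.shape[2]
    words = []
    ptr = int(all_best_hyp_indices[final_row, -1])   # final sort column: rank -> last-step row
    for step_col in range(num_steps - 1, -1, -1):
        words.append(int(all_best_word_indices[ptr, 0, step_col]))  # primary target factor only
        ptr = int(all_best_hyp_indices[ptr, step_col])              # row this hypothesis extended
    return list(reversed(words))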