def _decode(self, tokens, encoder_outs):
    # wrap in Variable
    tokens = utils.volatile_variable(tokens)

    avg_probs = None
    avg_attn = None
    for model, encoder_out in zip(self.models, encoder_outs):
        with utils.maybe_no_grad():
            decoder_out, attn = model.decoder(tokens, encoder_out)
            probs = model.get_normalized_probs(decoder_out[:, -1, :], log_probs=False).data
        if avg_probs is None:
            avg_probs = probs
        else:
            avg_probs.add_(probs)
        if attn is not None:
            attn = attn[:, -1, :].data
            if avg_attn is None:
                avg_attn = attn
            else:
                avg_attn.add_(attn)
    avg_probs.div_(len(self.models))
    avg_probs.log_()
    if avg_attn is not None:
        avg_attn.div_(len(self.models))
    return avg_probs, avg_attn

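# Illustrative sketch (not part of the original sources): the ensemble rule in
# _decode above is an arithmetic mean over the per-model next-step
# distributions, followed by an in-place log. The tensors below are made up
# purely to show the arithmetic.
import torch

model_probs = [
    torch.tensor([[0.7, 0.2, 0.1]]),  # model 1: next-token distribution
    torch.tensor([[0.5, 0.4, 0.1]]),  # model 2
]
avg_probs = torch.stack(model_probs).mean(dim=0)  # [[0.6, 0.3, 0.1]]
log_avg = avg_probs.log()  # corresponds to the avg_probs that _decode returns
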
def generate_batched_itr(self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
                         cuda=False, timer=None, prefix_size=0):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        maxlen_a/b: generate sequences of maximum length ``a*x + b``,
            where ``x`` is the source sentence length.
        cuda: use GPU for generation.
        timer: StopwatchMeter for timing generations.
    """
    if maxlen_b is None:
        maxlen_b = self.maxlen

    for sample in data_itr:
        s = utils.make_variable(sample, volatile=True, cuda=cuda)
        input = s['net_input']
        srclen = input['src_tokens'].size(1)
        if timer is not None:
            timer.start()
        with utils.maybe_no_grad():
            hypos = self.generate(
                input['src_tokens'],
                input['src_lengths'],
                beam_size=beam_size,
                maxlen=int(maxlen_a * srclen + maxlen_b),
                prefix_tokens=s['target'][:, :prefix_size] if prefix_size > 0 else None,
            )
        if timer is not None:
            timer.stop(sum(len(h[0]['tokens']) for h in hypos))
        for i, id in enumerate(s['id'].data):
            src = input['src_tokens'].data[i, :]
            # remove padding from ref
            ref = utils.strip_pad(s['target'].data[i, :], self.pad) if s['target'] is not None else None
            yield id, src, ref, hypos[i]

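# Illustrative sketch (not part of the original sources): the maxlen_a /
# maxlen_b arguments above bound the target length as a linear function of
# the source length, maxlen = int(maxlen_a * srclen + maxlen_b). With toy
# values:
srclen = 50
maxlen_a, maxlen_b = 0.2, 10
maxlen = int(maxlen_a * srclen + maxlen_b)  # 20: at most 20 target tokens
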
def forward(self, input, incremental_state=None):
    """
    Input:
        Time x Batch x Channel during training;
        Batch x Time x Channel during incremental inference.

    Args:
        incremental_state: Used to buffer signal; if not None, then input is
            expected to contain a single frame. If the input order changes
            between time steps, call reorder_incremental_state.
    """
    if incremental_state is None:
        return super().forward(input)

    # reshape weight
    weight = self._get_linearized_weight()
    kw = self.kernel_size[0]

    bsz = input.size(0)  # input: bsz x len x dim
    if kw > 1:
        input = input.data
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is None:
            input_buffer = input.new(bsz, kw, input.size(2)).zero_()
            self._set_input_buffer(incremental_state, input_buffer)
        else:
            # shift buffer
            input_buffer[:, :-1, :] = input_buffer[:, 1:, :].clone()
        # append next input
        input_buffer[:, -1, :] = input[:, -1, :]
        input = utils.volatile_variable(input_buffer)
    with utils.maybe_no_grad():
        output = F.linear(input.view(bsz, -1), weight, self.bias)
    return output.view(bsz, 1, -1)

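# Illustrative sketch (not part of the original sources): the incremental
# path above keeps a (bsz, kernel_width, dim) buffer of the last `kw` input
# frames, shifting it left by one slot and appending the newest frame each
# decoding step. The same shift-and-append, shown on a toy tensor:
import torch

bsz, kw, dim = 1, 3, 2
buf = torch.zeros(bsz, kw, dim)
for step in range(4):
    frame = torch.full((bsz, dim), float(step))  # stand-in for input[:, -1, :]
    buf[:, :-1, :] = buf[:, 1:, :].clone()       # shift buffer left
    buf[:, -1, :] = frame                        # append next input
# after 4 steps, buf holds frames 1, 2, 3 (frame 0 was shifted out)
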
def _decode(self, tokens, encoder_outs, incremental_states):
    # wrap in Variable
    tokens = utils.volatile_variable(tokens)

    avg_probs = None
    avg_attn = None
    for model, encoder_out in zip(self.models, encoder_outs):
        with utils.maybe_no_grad():
            if incremental_states[model] is not None:
                decoder_out = list(model.decoder(tokens, encoder_out, incremental_states[model]))
            else:
                decoder_out = list(model.decoder(tokens, encoder_out))
            decoder_out[0] = decoder_out[0][:, -1, :]
            attn = decoder_out[1]
            probs = model.get_normalized_probs(decoder_out, log_probs=False).data
        if avg_probs is None:
            avg_probs = probs
        else:
            avg_probs.add_(probs)
        if attn is not None:
            attn = attn[:, -1, :].data
            if avg_attn is None:
                avg_attn = attn
            else:
                avg_attn.add_(attn)
    avg_probs.div_(len(self.models))
    avg_probs.log_()
    if avg_attn is not None:
        avg_attn.div_(len(self.models))
    return avg_probs, avg_attn

def _decode(self, tokens, encoder_outs, src_doctopic_reshaped, incremental_states):
    # wrap in Variable
    tokens = utils.volatile_variable(tokens)

    avg_probs = None
    avg_attn = None
    for model, encoder_out in zip(self.models, encoder_outs):
        with utils.maybe_no_grad():
            decoder_out, attn = model.decoder(
                tokens, encoder_out, src_doctopic_reshaped, incremental_states[model])
            probs = model.get_normalized_probs(decoder_out[:, -1, :], log_probs=False).data
        if avg_probs is None:
            avg_probs = probs
        else:
            avg_probs.add_(probs)
        if attn is not None:
            attn = attn[:, -1, :].data
            if avg_attn is None:
                avg_attn = attn
            else:
                avg_attn.add_(attn)
    avg_probs.div_(len(self.models))
    avg_probs.log_()
    if avg_attn is not None:
        avg_attn.div_(len(self.models))
    return avg_probs, avg_attn

def _async_forward(self, rank, device_id, eval=False):
    if eval:
        self.model.eval()
    else:
        self.model.train()
        self.optimizer.zero_grad()

    with utils.maybe_no_grad(eval):
        sample_size, logging_output, oom = 0, {}, False
        if self._sample is not None:
            try:
                # calculate loss and sample size
                self.loss, sample_size, logging_output = self.criterion(self.model, self._sample)
            except RuntimeError as e:
                if not eval and 'out of memory' in str(e):
                    print('| WARNING: ran out of memory on GPU #{}, skipping batch'.format(device_id))
                    oom = True
                    self.loss = None
                    if hasattr(torch.cuda, 'empty_cache'):
                        torch.cuda.empty_cache()
                else:
                    raise e

    return sample_size, logging_output, oom

def generate(self, src_tokens, src_lengths, beam_size=None, maxlen=None, prefix_tokens=None):
    """Generate a batch of translations."""
    with utils.maybe_no_grad():
        return self._generate(src_tokens, src_lengths, beam_size, maxlen, prefix_tokens)

def generate_batched_itr(
    self,
    data_itr,
    beam_size=None,
    maxlen_a=0.0,
    maxlen_b=None,
    cuda=False,
    timer=None,
    prefix_size=0,
):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        maxlen_a/b: generate sequences of maximum length ``a*x + b``,
            where ``x`` is the source sentence length.
        cuda: use GPU for generation.
        timer: StopwatchMeter for timing generations.
    """
    if maxlen_b is None:
        maxlen_b = self.maxlen

    for sample in data_itr:
        s = utils.make_variable(sample, volatile=True, cuda=cuda)
        input = s["net_input"]
        # Take the max source length to compute the max target length
        srclen = input["src_tokens"].size(1)
        # FIXME: handle characters properly
        if self.use_char_source:
            raise ValueError(
                "Character level encoder is not supported yet for "
                "multisource sentences."
            )
        encoder_inputs = (input["src_tokens"], input["src_lengths"])
        if timer is not None:
            timer.start()
        with utils.maybe_no_grad():
            hypos = self.generate(
                encoder_inputs,
                srcs_ids=input["src_ids"],
                beam_size=beam_size,
                maxlen=int(maxlen_a * srclen + maxlen_b),
                prefix_tokens=s["target"][:, :prefix_size] if prefix_size > 0 else None,
            )
        if timer is not None:
            timer.stop(s["ntokens"])
        for i, id in enumerate(s["id"]):
            src = input["src_tokens"].index_select(0, input["src_ids"][self.align_to])
            # remove padding from ref
            ref = utils.strip_pad(s["target"][i, :], self.pad)
            yield id, src, ref, hypos[i]

def generate_batched_itr(
    self,
    data_itr,
    beam_size=None,
    maxlen_a=0.0,
    maxlen_b=None,
    cuda=False,
    timer=None,
    prefix_size=0,
):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        maxlen_a/b: generate sequences of maximum length ``a*x + b``,
            where ``x`` is the source sentence length.
        cuda: use GPU for generation.
        timer: StopwatchMeter for timing generations.
    """
    if maxlen_b is None:
        maxlen_b = self.maxlen

    for sample in data_itr:
        s = utils.make_variable(sample, volatile=True, cuda=cuda)
        input = s["net_input"]
        srclen = input["src_tokens"].size(1)
        if self.use_char_source:
            encoder_input = (
                input["src_tokens"],
                input["src_lengths"],
                input["char_inds"],
                input["word_lengths"],
            )
        else:
            encoder_input = (input["src_tokens"], input["src_lengths"])
        if timer is not None:
            timer.start()
        with utils.maybe_no_grad():
            hypos = self.generate(
                encoder_input,
                beam_size=beam_size,
                maxlen=int(maxlen_a * srclen + maxlen_b),
                prefix_tokens=s["target"][:, :prefix_size] if prefix_size > 0 else None,
            )
        if timer is not None:
            timer.stop(s["ntokens"])
        for i, id in enumerate(s["id"].data):
            src = input["src_tokens"].data[i, :]
            # remove padding from ref
            ref = utils.strip_pad(s["target"].data[i, :], self.pad)
            yield id, src, ref, hypos[i]

def generate(
    self,
    encoder_inputs,
    srcs_ids,
    beam_size=None,
    maxlen=None,
    prefix_tokens=None,
    src_weights=None,
):
    """Generate a batch of translations."""
    with utils.maybe_no_grad():
        return self._generate(
            encoder_inputs, srcs_ids, beam_size, maxlen, prefix_tokens, src_weights
        )

def _forward(self, sample, eval=False):
    # prepare model and optimizer
    if eval:
        self.model.eval()
    else:
        self.model.train()
        self.optimizer.zero_grad()

    loss = None
    sample_size = 0
    logging_output = {
        'ntokens': sample['ntokens'] if sample is not None else 0,
        'nsentences': sample['target'].size(0) if sample is not None else 0,
    }
    oom = 0
    if sample is not None:
        try:
            with utils.maybe_no_grad(eval):
                # calculate loss and sample size
                loss, sample_size, logging_output_ = self.criterion(self.model, sample)
                logging_output.update(logging_output_)
        except RuntimeError as e:
            if not eval and 'out of memory' in str(e):
                print('| WARNING: ran out of memory, skipping batch')
                oom = 1
                loss = None
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                raise e

    # synchronize logging outputs for multi-GPU training
    if self.args.distributed_world_size > 1:
        sample_sizes, logging_outputs, ooms = zip(*list(
            distributed_utils.all_gather_list((sample_size, logging_output, oom))))
        ooms = sum(ooms)
    else:
        sample_sizes = [sample_size]
        logging_outputs = [logging_output]
        ooms = oom

    return loss, sample_sizes, logging_outputs, ooms

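# Illustrative sketch (not part of the original sources): the zip(*...) idiom
# above turns a per-worker list of (sample_size, logging_output, oom) triples
# into three parallel tuples. With made-up gathered results:
gathered = [
    (100, {'ntokens': 100}, 0),  # worker 0
    (120, {'ntokens': 120}, 1),  # worker 1 hit OOM
]
sample_sizes, logging_outputs, ooms = zip(*gathered)
# sample_sizes == (100, 120); ooms == (0, 1); sum(ooms) == 1
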
def _decode(self, tokens, encoder_outs, incremental_states, n_srcs=1):
    # wrap in Variable
    tokens = utils.volatile_variable(tokens)

    # Source sentences are weighted equally (for now)
    srcs_weights = [1 / n_srcs] * n_srcs

    avg_probs = None
    avg_attn = None
    for src_id, src_weight in enumerate(srcs_weights):
        for model_id, (model_weight, model) in enumerate(
            zip(self.model_weights, self.models)
        ):
            with utils.maybe_no_grad():
                encoder_out = encoder_outs[src_id][model_id]
                incremental_state = incremental_states[(src_id, model_id)]
                decoder_out = list(model.decoder(tokens, encoder_out, incremental_state))
                decoder_out[0] = decoder_out[0][:, -1, :]
                attn = decoder_out[1]
                if len(decoder_out) == 3:
                    possible_translation_tokens = decoder_out[2]
                else:
                    possible_translation_tokens = None
                probs = (
                    src_weight
                    * model_weight
                    * model.get_normalized_probs(decoder_out, log_probs=False)
                )
            if avg_probs is None:
                avg_probs = probs
            else:
                avg_probs.add_(probs)
            if attn is not None and src_id == self.align_to:
                attn = attn[:, -1, :]
                if avg_attn is None:
                    avg_attn = attn
                else:
                    avg_attn.add_(attn)
    avg_probs.log_()
    if avg_attn is not None:
        avg_attn.div_(len(self.models))
    return avg_probs, avg_attn, possible_translation_tokens

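# Illustrative sketch (not part of the original sources): each probability
# entering the running sum above is scaled by src_weight * model_weight, so
# with uniform source weights 1/n_srcs and model weights that sum to 1 the
# coefficients already sum to 1 and no final div_() is needed, which is why
# _decode calls log_() directly. Toy numbers:
import torch

model_weights = [0.7, 0.3]    # assumed to sum to 1
src_weights = [0.5, 0.5]      # 1 / n_srcs with n_srcs = 2
p = torch.tensor([0.6, 0.4])  # stand-in per-model distribution
avg = sum(sw * mw * p for sw in src_weights for mw in model_weights)
# the coefficients sw * mw sum to 1.0 over all pairs, so avg equals p here
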
def score(self, sample):
    """Score a batch of translations."""
    net_input = sample['net_input']

    # compute scores for each model in the ensemble
    avg_probs = None
    avg_attn = None
    for model in self.models:
        with utils.maybe_no_grad():
            model.eval()
            encoder_out = model.encoder(
                net_input['src_tokens'],
                net_input['src_lengths'],
            )
            decoder_out = model.decoder(
                net_input['prev_output_tokens'],
                encoder_out,
            )
            attn = decoder_out[1]
            probs = model.get_normalized_probs(decoder_out, log_probs=False).data
        if avg_probs is None:
            avg_probs = probs
        else:
            avg_probs.add_(probs)
        if attn is not None:
            attn = attn.data
            if avg_attn is None:
                avg_attn = attn
            else:
                avg_attn.add_(attn)
    avg_probs.div_(len(self.models))
    avg_probs.log_()
    if avg_attn is not None:
        avg_attn.div_(len(self.models))
    avg_probs = avg_probs.gather(
        dim=2,
        index=sample['target'].data.unsqueeze(-1),
    )
    return avg_probs.squeeze(2), avg_attn

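# Illustrative sketch (not part of the original sources): the final gather in
# score() picks, at every target position, the averaged log-probability of
# the token that actually appears in the reference. On a toy batch:
import torch

log_probs = torch.log(torch.tensor([[[0.6, 0.3, 0.1],
                                     [0.2, 0.5, 0.3]]]))  # bsz x tgtlen x vocab
target = torch.tensor([[0, 1]])                           # gold token ids
scores = log_probs.gather(dim=2, index=target.unsqueeze(-1)).squeeze(2)
# scores == log([[0.6, 0.5]]): per-token log-probability of the reference
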
def generate_batched_itr(self, data_itr, beam_size=None, maxlen_a=0.0, maxlen_b=None,
                         cuda_device=None, timer=None):
    """Iterate over a batched dataset and yield individual translations.

    Args:
        maxlen_a/b: generate sequences of maximum length ``a*x + b``,
            where ``x`` is the source sentence length.
        cuda_device: GPU on which to do generation.
        timer: StopwatchMeter for timing generations.
    """
    if maxlen_b is None:
        maxlen_b = self.maxlen

    for sample in data_itr:
        s = utils.make_variable(sample, volatile=True, cuda_device=cuda_device)
        input = s['net_input']
        srclen = input['src_tokens'].size(1)
        if timer is not None:
            timer.start()
        with utils.maybe_no_grad():
            hypos = self.generate(input['src_tokens'], beam_size=beam_size,
                                  maxlen=int(maxlen_a * srclen + maxlen_b))
        if timer is not None:
            timer.stop(s['ntokens'])
        for i, id in enumerate(s['id'].data):
            src = input['src_tokens'].data[i, :]
            # remove padding from ref
            ref = utils.strip_pad(s['target'].data[i, :], self.pad)
            yield id, src, ref, hypos[i]

def _decode(self, tokens, encoder_outs, incremental_states):
    # wrap in Variable
    tokens = utils.volatile_variable(tokens)

    avg_probs = None
    avg_attn = None
    for model_weight, model, encoder_out in zip(
        self.model_weights, self.models, encoder_outs
    ):
        with utils.maybe_no_grad():
            decoder_out = list(model.decoder(tokens, encoder_out, incremental_states[model]))
            decoder_out[0] = decoder_out[0][:, -1, :]
            attn = decoder_out[1]
            if len(decoder_out) == 3:
                possible_translation_tokens = decoder_out[2]
            else:
                possible_translation_tokens = None
            probs = model_weight * model.get_normalized_probs(decoder_out, log_probs=False)
        if avg_probs is None:
            avg_probs = probs
        else:
            avg_probs.add_(probs)
        if attn is not None:
            attn = attn[:, -1, :]
            if avg_attn is None:
                avg_attn = attn
            else:
                avg_attn.add_(attn)
    avg_probs.log_()
    if avg_attn is not None:
        avg_attn.div_(len(self.models))
    return avg_probs, avg_attn, possible_translation_tokens
