def _build_model(
    self,
    init_params,
):
    """Construct the training model and a separate forward-only model.

    Builds two Seq2SeqModelHelper instances that share structure (via
    _build_shared / _build_embeddings): `model` gets the loss, gradients,
    and norm-clipped parameter updates; `forward_model` gets only the
    forward pass and is exposed as `self.forward_net` for inference.

    Args:
        init_params: passed through to Seq2SeqModelHelper; presumably
            controls whether parameter blobs are initialized — TODO confirm
            against Seq2SeqModelHelper.

    Side effects:
        Sets self.model and self.forward_net.
    """
    model = Seq2SeqModelHelper(init_params=init_params)
    self._build_shared(model)
    self._build_embeddings(model)

    # Second helper with identical shared structure/embeddings, used for
    # the forward-only (no-gradient) net.
    forward_model = Seq2SeqModelHelper(init_params=init_params)
    self._build_shared(forward_model)
    self._build_embeddings(forward_model)

    if self.num_gpus == 0:
        # CPU path: build forward pass + gradients + clipped dense update
        # directly on the single net.
        loss_blobs = self.model_build_fun(model)
        model.AddGradientOperators(loss_blobs)
        self.norm_clipped_grad_update(
            model,
            scope='norm_clipped_grad_update'
        )
        self.forward_model_build_fun(forward_model)
    else:
        # GPU path: batch must shard evenly across devices.
        assert (self.batch_size % self.num_gpus) == 0

        # Forward-only net: no param_update_builder_fun, so no gradients
        # or updates are added.
        data_parallel_model.Parallelize_GPU(
            forward_model,
            input_builder_fun=lambda m: None,
            forward_pass_builder_fun=self.forward_model_build_fun,
            param_update_builder_fun=None,
            devices=list(range(self.num_gpus)),
        )

        # Bind `self` so Parallelize_GPU can invoke the update builder
        # with just the model argument.
        def clipped_grad_update_bound(model):
            self.norm_clipped_grad_update(
                model,
                scope='norm_clipped_grad_update',
            )

        # Training net: forward pass per device plus the clipped dense
        # gradient update injected by the builder above.
        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=lambda m: None,
            forward_pass_builder_fun=self.model_build_fun,
            param_update_builder_fun=clipped_grad_update_bound,
            devices=list(range(self.num_gpus)),
        )

    # Sparse (embedding) gradients get their own clipped update in both
    # the CPU and GPU paths.
    self.norm_clipped_sparse_grad_update(
        model,
        scope='norm_clipped_sparse_grad_update',
    )
    self.model = model
    self.forward_net = forward_model.net
def __init__(self, beam_size, model, go_token_id, eos_token_id):
    """Set up the per-timestep beam-search step model.

    Creates a `step_model` net sharing parameters with `model`
    (param_model=self.model) and declares the external input blobs that
    carry beam state between timesteps. Also casts/reshapes the previous
    tokens into a flat int32 row for the decoder step.

    Args:
        beam_size: number of hypotheses kept per step.
        model: parent Seq2SeqModelHelper whose params the step model shares.
        go_token_id: id of the sequence-start token.
        eos_token_id: id of the end-of-sequence token.
    """
    self.beam_size = beam_size
    self.model = model
    # Step model re-uses the parent model's parameters; its net is run
    # once per decoding timestep.
    self.step_model = Seq2SeqModelHelper(
        name='step_model',
        param_model=self.model,
    )
    self.go_token_id = go_token_id
    self.eos_token_id = eos_token_id

    # Beam state fed in each timestep: current step index, previous
    # scores/tokens/hypotheses indices, and previous attention weights.
    (
        self.timestep,
        self.scores_t_prev,
        self.tokens_t_prev,
        self.hypo_t_prev,
        self.attention_t_prev,
    ) = self.step_model.net.AddExternalInputs(
        'timestep',
        'scores_t_prev',
        'tokens_t_prev',
        'hypo_t_prev',
        'attention_t_prev',
    )

    tokens_t_prev_int32 = self.step_model.net.Cast(
        self.tokens_t_prev,
        'tokens_t_prev_int32',
        to=core.DataType.INT32,
    )
    # Flatten to a single row [1, beam_size]; the pre-reshape shape is
    # kept in 'input_t_int32_old_shape' (Reshape's second output).
    self.tokens_t_prev_int32_flattened, _ = self.step_model.net.Reshape(
        [tokens_t_prev_int32],
        [tokens_t_prev_int32, 'input_t_int32_old_shape'],
        shape=[1, -1],
    )
def __init__(
    self,
    translate_params,
):
    """Build the ensemble beam-search translation net.

    Constructs one decoder per ensemble member under its own scope,
    averages their per-step log-probs (and attention weights) with
    uniform 1/N weights, wires everything through BeamSearchForwardOnly,
    then initializes parameters and creates the runnable net in the
    global workspace.

    Args:
        translate_params: dict with 'ensemble_models' (each having
            'source_vocab', 'target_vocab', 'model_params') and
            'decoding_params' ('beam_size', 'word_reward', 'unk_reward').
    """
    self.models = translate_params['ensemble_models']
    decoding_params = translate_params['decoding_params']
    self.beam_size = decoding_params['beam_size']

    # All ensemble members must share the same vocabularies.
    assert len(self.models) > 0
    source_vocab = self.models[0]['source_vocab']
    target_vocab = self.models[0]['target_vocab']
    for model in self.models:
        assert model['source_vocab'] == source_vocab
        assert model['target_vocab'] == target_vocab

    self.source_vocab_size = len(source_vocab)
    self.target_vocab_size = len(target_vocab)

    # One blob scope per ensemble member: model0, model1, ...
    self.decoder_scope_names = [
        'model{}'.format(i) for i in range(len(self.models))
    ]

    self.model = Seq2SeqModelHelper(init_params=True)

    # External inputs fed by the caller at translation time.
    self.encoder_inputs = self.model.net.AddExternalInput('encoder_inputs')
    self.encoder_lengths = self.model.net.AddExternalInput(
        'encoder_lengths')
    self.max_output_seq_len = self.model.net.AddExternalInput(
        'max_output_seq_len')

    # Placeholder sequence lengths for the decoder steps; value 100000
    # presumably means "effectively unbounded" — TODO confirm against
    # _build_decoder's use of seq_lengths.
    fake_seq_lengths = self.model.param_init_net.ConstantFill(
        [],
        'fake_seq_lengths',
        shape=[self.beam_size],
        value=100000,
        dtype=core.DataType.INT32,
    )

    beam_decoder = BeamSearchForwardOnly(
        beam_size=self.beam_size,
        model=self.model,
        go_token_id=seq2seq_util.GO_ID,
        eos_token_id=seq2seq_util.EOS_ID,
    )
    step_model = beam_decoder.get_step_model()

    # Collect per-decoder recurrent state configs, log-probs, and
    # (optional) attention weights.
    state_configs = []
    output_log_probs = []
    attention_weights = []
    for model, scope_name in zip(
        self.models,
        self.decoder_scope_names,
    ):
        (
            state_configs_per_decoder,
            output_log_probs_per_decoder,
            attention_weights_per_decoder,
        ) = self._build_decoder(
            model=self.model,
            step_model=step_model,
            model_params=model['model_params'],
            scope=scope_name,
            previous_tokens=beam_decoder.get_previous_tokens(),
            timestep=beam_decoder.get_timestep(),
            fake_seq_lengths=fake_seq_lengths,
        )
        state_configs.extend(state_configs_per_decoder)
        output_log_probs.append(output_log_probs_per_decoder)
        if attention_weights_per_decoder is not None:
            attention_weights.append(attention_weights_per_decoder)

    # At least one decoder must use attention for the average below.
    assert len(attention_weights) > 0
    num_decoders_with_attention_blob = (
        self.model.param_init_net.ConstantFill(
            [],
            'num_decoders_with_attention_blob',
            value=1 / float(len(attention_weights)),
            shape=[1],
        )
    )
    # Uniform average of attention weights: [beam_size, encoder_length, 1]
    attention_weights_average = _weighted_sum(
        model=step_model,
        values=attention_weights,
        weight=num_decoders_with_attention_blob,
        output_name='attention_weights_average',
    )

    num_decoders_blob = self.model.param_init_net.ConstantFill(
        [],
        'num_decoders_blob',
        value=1 / float(len(output_log_probs)),
        shape=[1],
    )
    # Uniform average of log-probs: [beam_size, target_vocab_size]
    output_log_probs_average = _weighted_sum(
        model=step_model,
        values=output_log_probs,
        weight=num_decoders_blob,
        output_name='output_log_probs_average',
    )
    # Initialized to zeros here; overwritten below by FeedBlob with the
    # actual per-token rewards from build_word_rewards.
    word_rewards = self.model.param_init_net.ConstantFill(
        [],
        'word_rewards',
        shape=[self.target_vocab_size],
        value=0,
    )
    (
        self.output_token_beam_list,
        self.output_prev_index_beam_list,
        self.output_score_beam_list,
        self.output_attention_weights_beam_list,
    ) = beam_decoder.apply(
        inputs=self.encoder_inputs,
        length=self.max_output_seq_len,
        log_probs=output_log_probs_average,
        attentions=attention_weights_average,
        state_configs=state_configs,
        data_dependencies=[],
        word_rewards=word_rewards,
    )

    # Materialize parameters, then feed the real word rewards before
    # creating the net so translation runs use them.
    workspace.RunNetOnce(self.model.param_init_net)
    workspace.FeedBlob(
        'word_rewards',
        self.build_word_rewards(
            vocab_size=self.target_vocab_size,
            word_reward=translate_params['decoding_params']['word_reward'],
            unk_reward=translate_params['decoding_params']['unk_reward'],
        ))
    workspace.CreateNet(
        self.model.net,
        input_blobs=[
            str(self.encoder_inputs),
            str(self.encoder_lengths),
            str(self.max_output_seq_len),
        ],
    )

    logger.info('Params created: ')
    for param in self.model.params:
        logger.info(param)