def _build_model(
    self,
    init_params,
):
    model = seq2seq_util.ModelHelper(init_params=init_params)
    self.encoder_inputs = model.net.AddExternalInput('encoder_inputs')
    self.encoder_lengths = model.net.AddExternalInput('encoder_lengths')
    self.decoder_inputs = model.net.AddExternalInput('decoder_inputs')
    self.decoder_lengths = model.net.AddExternalInput('decoder_lengths')
    self.targets = model.net.AddExternalInput('targets')
    self.target_weights = model.net.AddExternalInput('target_weights')

    optimizer_params = self.model_params['optimizer_params']
    attention_type = self.model_params['attention']
    assert attention_type in ['none', 'regular']

    self.learning_rate = model.AddParam(
        name='learning_rate',
        init_value=float(optimizer_params['learning_rate']),
        trainable=False,
    )
    self.global_step = model.AddParam(
        name='global_step',
        init_value=0,
        trainable=False,
    )
    self.start_time = model.AddParam(
        name='start_time',
        init_value=time.time(),
        trainable=False,
    )

    assert self.num_gpus < 2
    assert len(self.encoder_params['encoder_layer_configs']) == 1
    assert len(self.model_params['decoder_layer_configs']) == 1

    encoder_num_units = (
        self.encoder_params['encoder_layer_configs'][0]['num_units']
    )
    decoder_num_units = (
        self.model_params['decoder_layer_configs'][0]['num_units']
    )

    (
        encoder_outputs,
        final_encoder_hidden_state,
        final_encoder_cell_state,
    ) = self._embedding_encoder(
        model=model,
        encoder_type=self.encoder_type,
        encoder_params=self.encoder_params,
        inputs=self.encoder_inputs,
        input_lengths=self.encoder_lengths,
        vocab_size=self.source_vocab_size,
        embedding_size=self.model_params['encoder_embedding_size'],
        use_attention=(attention_type != 'none'),
    )

    # For a bidirectional RNN, the number of units doubles after encoding
    if (
        self.encoder_type == 'rnn' and
        self.encoder_params['use_bidirectional_encoder']
    ):
        encoder_num_units *= 2

    if attention_type == 'none':
        decoder_initial_hidden_state = model.FC(
            final_encoder_hidden_state,
            'decoder_initial_hidden_state',
            encoder_num_units,
            decoder_num_units,
            axis=2,
        )
        decoder_initial_cell_state = model.FC(
            final_encoder_cell_state,
            'decoder_initial_cell_state',
            encoder_num_units,
            decoder_num_units,
            axis=2,
        )
    else:
        decoder_initial_hidden_state = model.param_init_net.ConstantFill(
            [],
            'decoder_initial_hidden_state',
            shape=[decoder_num_units],
            value=0.0,
        )
        decoder_initial_cell_state = model.param_init_net.ConstantFill(
            [],
            'decoder_initial_cell_state',
            shape=[decoder_num_units],
            value=0.0,
        )
        initial_attention_weighted_encoder_context = (
            model.param_init_net.ConstantFill(
                [],
                'initial_attention_weighted_encoder_context',
                shape=[encoder_num_units],
                value=0.0,
            )
        )

    sqrt3 = math.sqrt(3)
    decoder_embeddings = model.AddParam(
        name='decoder_embeddings',
        init=('UniformFill', dict(
            shape=[
                self.target_vocab_size,
                self.model_params['decoder_embedding_size'],
            ],
            min=-sqrt3,
            max=sqrt3,
        )),
    )

    embedded_decoder_inputs = model.net.Gather(
        [decoder_embeddings, self.decoder_inputs],
        ['embedded_decoder_inputs'],
    )
    # seq_len x batch_size x decoder_embedding_size
    with core.NameScope('', reset=True):
        if attention_type == 'none':
            decoder_outputs, _, _, _ = recurrent.LSTM(
                model=model,
                input_blob=embedded_decoder_inputs,
                seq_lengths=self.decoder_lengths,
                initial_states=(
                    decoder_initial_hidden_state,
                    decoder_initial_cell_state,
                ),
                dim_in=self.model_params['decoder_embedding_size'],
                dim_out=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0],
            )
            decoder_output_size = decoder_num_units
        else:
            (
                decoder_outputs, _, _, _,
                attention_weighted_encoder_contexts, _,
            ) = recurrent.LSTMWithAttention(
                model=model,
                decoder_inputs=embedded_decoder_inputs,
                decoder_input_lengths=self.decoder_lengths,
                initial_decoder_hidden_state=decoder_initial_hidden_state,
                initial_decoder_cell_state=decoder_initial_cell_state,
                initial_attention_weighted_encoder_context=(
                    initial_attention_weighted_encoder_context
                ),
                encoder_output_dim=encoder_num_units,
                encoder_outputs=encoder_outputs,
                decoder_input_dim=self.model_params['decoder_embedding_size'],
                decoder_state_dim=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0, 4],
            )
            decoder_outputs, _ = model.net.Concat(
                [decoder_outputs, attention_weighted_encoder_contexts],
                [
                    'states_and_context_combination',
                    '_states_and_context_combination_concat_dims',
                ],
                axis=2,
            )
            decoder_output_size = decoder_num_units + encoder_num_units

    # we do softmax over the whole sequence
    # (max_length in the batch * batch_size) x decoder embedding size
    # -1 because we don't know max_length yet
    decoder_outputs_flattened, _ = model.net.Reshape(
        [decoder_outputs],
        [
            'decoder_outputs_flattened',
            'decoder_outputs_and_contexts_combination_old_shape',
        ],
        shape=[-1, decoder_output_size],
    )
    output_logits = self.output_projection(
        model=model,
        decoder_outputs=decoder_outputs_flattened,
        decoder_output_size=decoder_output_size,
        target_vocab_size=self.target_vocab_size,
        decoder_softmax_size=self.model_params['decoder_softmax_size'],
    )
    targets, _ = model.net.Reshape(
        [self.targets],
        ['targets', 'targets_old_shape'],
        shape=[-1],
    )
    target_weights, _ = model.net.Reshape(
        [self.target_weights],
        ['target_weights', 'target_weights_old_shape'],
        shape=[-1],
    )
    output_probs, loss_per_word = model.net.SoftmaxWithLoss(
        [output_logits, targets, target_weights],
        ['OutputProbs', 'loss_per_word'],
    )
    num_words = model.net.ReduceFrontSum(
        target_weights,
        'num_words',
    )
    self.total_loss_scalar = model.net.Mul(
        [loss_per_word, num_words],
        'total_loss_scalar',
    )

    self.forward_net = model.net.Clone(
        name=model.net.Name() + '_forward_only',
    )
    # print loss only in the forward net, which evaluates loss after every
    # epoch
    self.forward_net.Print([self.total_loss_scalar], [])

    # Note: average over batch.
    # It is tricky because of two problems:
    # 1. ReduceFrontSum of a 1-D tensor returns a 0-D tensor
    # 2. If you want to multiply a 0-D tensor by a 1-D tensor
    #    (by scalar batch_size_inverse_tensor),
    #    you need to use broadcasting. But gradient propagation
    #    is broken for ops with broadcasting.
    # total_loss_scalar, _ = model.net.Reshape(
    #     [total_loss_scalar],
    #     [total_loss_scalar, 'total_loss_scalar_old_shape'],
    #     shape=[1],
    # )
    batch_size_inverse_tensor = model.param_init_net.ConstantFill(
        [],
        'batch_size_tensor',
        shape=[],
        value=1.0 / self.batch_size,
    )
    total_loss_scalar_average = model.net.Mul(
        [self.total_loss_scalar, batch_size_inverse_tensor],
        ['total_loss_scalar_average'],
    )

    model.AddGradientOperators([
        total_loss_scalar_average,
    ])
    ONE = model.param_init_net.ConstantFill(
        [],
        'ONE',
        shape=[1],
        value=1.0,
    )
    logger.info('All trainable variables: ')
    for param in model.params:
        # look up the gradient only after checking the param actually has one
        if param in model.param_to_grad:
            param_grad = model.param_to_grad[param]
            if isinstance(param_grad, core.GradientSlice):
                param_grad_values = param_grad.values
                param_grad_values = model.net.Clip(
                    [param_grad_values],
                    [param_grad_values],
                    min=0.0,
                    max=float(self.model_params['max_grad_value']),
                )
                model.net.ScatterWeightedSum(
                    [
                        param,
                        ONE,
                        param_grad.indices,
                        param_grad_values,
                        model.net.Negative(
                            [self.learning_rate],
                            'negative_learning_rate',
                        ),
                    ],
                    param,
                )
            else:
                param_grad = model.net.Clip(
                    [param_grad],
                    [param_grad],
                    min=0.0,
                    max=float(self.model_params['max_grad_value']),
                )
                model.net.WeightedSum(
                    [
                        param,
                        ONE,
                        param_grad,
                        model.net.Negative(
                            [self.learning_rate],
                            'negative_learning_rate',
                        ),
                    ],
                    param,
                )

    self.model = model
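
# A minimal sketch (hypothetical, not part of this file) of how the nets built
# by _build_model() could be driven from the caffe2 workspace API, assuming
# numpy batches shaped as the external inputs above expect (int32 token ids of
# shape seq_len x batch_size, int32 lengths of shape batch_size, float32
# target_weights); blob names match the AddExternalInput calls above:
#
#     from caffe2.python import workspace
#
#     workspace.RunNetOnce(self.model.param_init_net)
#
#     workspace.FeedBlob('encoder_inputs', encoder_token_ids)
#     workspace.FeedBlob('encoder_lengths', encoder_lengths)
#     workspace.FeedBlob('decoder_inputs', decoder_token_ids)
#     workspace.FeedBlob('decoder_lengths', decoder_lengths)
#     workspace.FeedBlob('targets', target_token_ids)
#     workspace.FeedBlob('target_weights', target_weights)
#
#     workspace.CreateNet(self.model.net)
#     workspace.CreateNet(self.forward_net)
#
#     workspace.RunNet(self.model.net)    # one training step (SGD update above)
#     workspace.RunNet(self.forward_net)  # forward-only pass; prints total_loss_scalar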

def test_lstm_with_recurrent_attention(
    self,
    encoder_output_length,
    encoder_output_dim,
    decoder_input_length,
    decoder_state_dim,
    batch_size,
    gc,
    dc,
):
    with core.DeviceScope(gc):
        model = CNNModelHelper(name="external")
        (
            encoder_outputs,
            decoder_inputs,
            decoder_input_lengths,
            initial_decoder_hidden_state,
            initial_decoder_cell_state,
            initial_attention_weighted_encoder_context,
        ) = model.net.AddExternalInputs(
            "encoder_outputs",
            "decoder_inputs",
            "decoder_input_lengths",
            "initial_decoder_hidden_state",
            "initial_decoder_cell_state",
            "initial_attention_weighted_encoder_context",
        )
        recurrent.LSTMWithAttention(
            model=model,
            decoder_inputs=decoder_inputs,
            decoder_input_lengths=decoder_input_lengths,
            initial_decoder_hidden_state=initial_decoder_hidden_state,
            initial_decoder_cell_state=initial_decoder_cell_state,
            initial_attention_weighted_encoder_context=(
                initial_attention_weighted_encoder_context
            ),
            encoder_output_dim=encoder_output_dim,
            encoder_outputs=encoder_outputs,
            decoder_input_dim=decoder_state_dim,
            decoder_state_dim=decoder_state_dim,
            scope='external/LSTMWithAttention',
            attention_type=AttentionType.Recurrent,
        )
        op = model.net._net.op[-1]

        workspace.RunNetOnce(model.param_init_net)

        # This is the original decoder_inputs after the linear layer
        decoder_input_blob = op.input[0]

        workspace.FeedBlob(
            decoder_input_blob,
            np.random.randn(
                decoder_input_length,
                batch_size,
                decoder_state_dim * 4,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            "external/LSTMWithAttention/encoder_outputs_transposed",
            np.random.randn(
                batch_size,
                encoder_output_dim,
                encoder_output_length,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            "external/LSTMWithAttention/weighted_encoder_outputs",
            np.random.randn(
                encoder_output_length,
                batch_size,
                encoder_output_dim,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            decoder_input_lengths,
            np.random.randint(
                0,
                decoder_input_length + 1,
                size=(batch_size,),
            ).astype(np.int32),
        )
        workspace.FeedBlob(
            initial_decoder_hidden_state,
            np.random.randn(
                1, batch_size, decoder_state_dim,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            initial_decoder_cell_state,
            np.random.randn(
                1, batch_size, decoder_state_dim,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            initial_attention_weighted_encoder_context,
            np.random.randn(
                1, batch_size, encoder_output_dim,
            ).astype(np.float32),
        )

        inputs = [workspace.FetchBlob(name) for name in op.input]

        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=inputs,
            reference=lstm_with_recurrent_attention_reference,
            grad_reference=None,
            output_to_grad=None,
            outputs_to_check=range(6),
        )
        gradients_to_check = [
            index for (index, input_name) in enumerate(op.input)
            if input_name != "decoder_input_lengths"
        ]
        for param in gradients_to_check:
            self.assertGradientChecks(
                device_option=gc,
                op=op,
                inputs=inputs,
                outputs_to_check=param,
                outputs_with_grads=[0, 4],
                threshold=0.01,
                stepsize=0.001,
            )

def model_build_fun(self, model, forward_only=False, loss_scale=None):
    encoder_inputs = model.net.AddExternalInput(
        workspace.GetNameScope() + 'encoder_inputs',
    )
    encoder_lengths = model.net.AddExternalInput(
        workspace.GetNameScope() + 'encoder_lengths',
    )
    decoder_inputs = model.net.AddExternalInput(
        workspace.GetNameScope() + 'decoder_inputs',
    )
    decoder_lengths = model.net.AddExternalInput(
        workspace.GetNameScope() + 'decoder_lengths',
    )
    targets = model.net.AddExternalInput(
        workspace.GetNameScope() + 'targets',
    )
    target_weights = model.net.AddExternalInput(
        workspace.GetNameScope() + 'target_weights',
    )

    attention_type = self.model_params['attention']
    assert attention_type in ['none', 'regular']

    (
        encoder_outputs,
        weighted_encoder_outputs,
        final_encoder_hidden_state,
        final_encoder_cell_state,
        encoder_output_dim,
    ) = self._build_embedding_encoder(
        model=model,
        inputs=encoder_inputs,
        input_lengths=encoder_lengths,
        vocab_size=self.source_vocab_size,
        embeddings=self.encoder_embeddings,
        embedding_size=self.model_params['encoder_embedding_size'],
        use_attention=(attention_type != 'none'),
        num_gpus=self.num_gpus,
        forward_only=forward_only,
    )

    assert len(self.model_params['decoder_layer_configs']) == 1
    decoder_num_units = (
        self.model_params['decoder_layer_configs'][0]['num_units']
    )

    if attention_type == 'none':
        decoder_initial_hidden_state = model.FC(
            final_encoder_hidden_state,
            'decoder_initial_hidden_state',
            encoder_output_dim,
            decoder_num_units,
            axis=2,
        )
        decoder_initial_cell_state = model.FC(
            final_encoder_cell_state,
            'decoder_initial_cell_state',
            encoder_output_dim,
            decoder_num_units,
            axis=2,
        )
    else:
        decoder_initial_hidden_state = model.param_init_net.ConstantFill(
            [],
            'decoder_initial_hidden_state',
            shape=[decoder_num_units],
            value=0.0,
        )
        decoder_initial_cell_state = model.param_init_net.ConstantFill(
            [],
            'decoder_initial_cell_state',
            shape=[decoder_num_units],
            value=0.0,
        )
        initial_attention_weighted_encoder_context = (
            model.param_init_net.ConstantFill(
                [],
                'initial_attention_weighted_encoder_context',
                shape=[encoder_output_dim],
                value=0.0,
            )
        )

    if self.num_gpus == 0:
        embedded_decoder_inputs = model.net.Gather(
            [self.decoder_embeddings, decoder_inputs],
            ['embedded_decoder_inputs'],
        )
    else:
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            embedded_decoder_inputs_cpu = model.net.Gather(
                [self.decoder_embeddings, decoder_inputs],
                ['embedded_decoder_inputs_cpu'],
            )
        embedded_decoder_inputs = model.CopyCPUToGPU(
            embedded_decoder_inputs_cpu,
            'embedded_decoder_inputs',
        )

    # seq_len x batch_size x decoder_embedding_size
    if attention_type == 'none':
        decoder_outputs, _, _, _ = recurrent.LSTM(
            model=model,
            input_blob=embedded_decoder_inputs,
            seq_lengths=decoder_lengths,
            initial_states=(
                decoder_initial_hidden_state,
                decoder_initial_cell_state,
            ),
            dim_in=self.model_params['decoder_embedding_size'],
            dim_out=decoder_num_units,
            scope='decoder',
            outputs_with_grads=[0],
        )
        decoder_output_size = decoder_num_units
    else:
        (
            decoder_outputs, _, _, _,
            attention_weighted_encoder_contexts, _,
        ) = recurrent.LSTMWithAttention(
            model=model,
            decoder_inputs=embedded_decoder_inputs,
            decoder_input_lengths=decoder_lengths,
            initial_decoder_hidden_state=decoder_initial_hidden_state,
            initial_decoder_cell_state=decoder_initial_cell_state,
            initial_attention_weighted_encoder_context=(
                initial_attention_weighted_encoder_context
            ),
            encoder_output_dim=encoder_output_dim,
            encoder_outputs=encoder_outputs,
            decoder_input_dim=self.model_params['decoder_embedding_size'],
            decoder_state_dim=decoder_num_units,
            scope='decoder',
            outputs_with_grads=[0, 4],
        )
        decoder_outputs, _ = model.net.Concat(
            [decoder_outputs, attention_weighted_encoder_contexts],
            [
                'states_and_context_combination',
                '_states_and_context_combination_concat_dims',
            ],
            axis=2,
        )
        decoder_output_size = decoder_num_units + encoder_output_dim

    # we do softmax over the whole sequence
    # (max_length in the batch * batch_size) x decoder embedding size
    # -1 because we don't know max_length yet
    decoder_outputs_flattened, _ = model.net.Reshape(
        [decoder_outputs],
        [
            'decoder_outputs_flattened',
            'decoder_outputs_and_contexts_combination_old_shape',
        ],
        shape=[-1, decoder_output_size],
    )
    output_logits = self.output_projection(
        model=model,
        decoder_outputs=decoder_outputs_flattened,
        decoder_output_size=decoder_output_size,
        target_vocab_size=self.target_vocab_size,
        decoder_softmax_size=self.model_params['decoder_softmax_size'],
    )
    targets, _ = model.net.Reshape(
        [targets],
        ['targets', 'targets_old_shape'],
        shape=[-1],
    )
    target_weights, _ = model.net.Reshape(
        [target_weights],
        ['target_weights', 'target_weights_old_shape'],
        shape=[-1],
    )
    output_probs = model.net.Softmax(
        [output_logits],
        ['output_probs'],
        engine=('CUDNN' if self.num_gpus > 0 else None),
    )
    label_cross_entropy = model.net.LabelCrossEntropy(
        [output_probs, targets],
        ['label_cross_entropy'],
    )
    weighted_label_cross_entropy = model.net.Mul(
        [label_cross_entropy, target_weights],
        'weighted_label_cross_entropy',
    )
    total_loss_scalar = model.net.SumElements(
        [weighted_label_cross_entropy],
        'total_loss_scalar',
    )
    total_loss_scalar_weighted = model.net.Scale(
        [total_loss_scalar],
        'total_loss_scalar_weighted',
        scale=1.0 / self.batch_size,
    )
    return [total_loss_scalar_weighted]
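
# Sketch (hypothetical, not taken from this file): model_build_fun's
# (model, forward_only, loss_scale) signature and namescope-prefixed external
# inputs fit the builder-function pattern used by caffe2's data_parallel_model,
# so a multi-GPU trainer might wire it up roughly like this; the
# param_update_fun name below is an assumed placeholder:
#
#     from caffe2.python import data_parallel_model
#
#     data_parallel_model.Parallelize_GPU(
#         model,
#         input_builder_fun=lambda m: None,  # inputs are added inside model_build_fun
#         forward_pass_builder_fun=(
#             lambda m, loss_scale=None:
#                 self.model_build_fun(m, loss_scale=loss_scale)),
#         param_update_builder_fun=self.param_update_fun,  # hypothetical
#         devices=list(range(self.num_gpus)),
#     )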