def model_build_fun(model, loss_scale):
    """Add a single LSTM and a scaled toy loss to ``model``.

    The sizes (``T``, ``batch_per_device``, ``input_dim``, ``hidden_dim``)
    are read from ``self`` in the enclosing scope.

    Args:
        model: model helper that receives the init and forward operators.
        loss_scale: factor passed as ``scale`` to the final ``Scale`` op.

    Returns:
        A one-element list holding the scaled loss blob.
    """
    # Every sequence in the per-device batch has the full length T.
    full_lengths = np.array(
        [self.T] * self.batch_per_device, dtype=np.int32)
    workspace.FeedBlob(
        core.ScopedBlobReference("seq_lengths"), full_lengths)

    # Zero-filled initial hidden and cell states, created at init time.
    for state_blob in ("hidden_init", "cell_init"):
        model.param_init_net.ConstantFill(
            [],
            state_blob,
            value=0.0,
            shape=[1, self.batch_per_device, self.hidden_dim],
        )

    # Only the per-step output is used; the other three results are dropped.
    output, _, _, _ = recurrent.LSTM(
        model=model,
        input_blob="data",
        seq_lengths="seq_lengths",
        initial_states=("hidden_init", "cell_init"),
        dim_in=self.input_dim,
        dim_out=self.hidden_dim,
        scope="partest",
    )

    # A silly loss function: averaged difference between output and target.
    difference = model.Sub([output, "target"], "dist")
    averaged = model.AveragedLoss(difference, "loss")
    scaled = model.Scale(averaged, "loss_scaled", scale=loss_scale)
    return [scaled]
def create_model(args, queue):
    """Build the LSTM benchmark model that reads its input from ``queue``.

    Args:
        args: options object; ``input_dim``, ``hidden_dim`` and
            ``batch_size`` are read from it.
        queue: blob queue from which the ``input_data`` blob is dequeued.

    Returns:
        The constructed ``cnn.CNNModelHelper``.
    """
    model = cnn.CNNModelHelper(name="LSTM_bench")
    # 'target' is registered as an external input but not used below.
    seq_lengths, hidden_init, cell_init, _target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'hidden_init',
            'cell_init',
            'target',
        )
    input_blob = model.DequeueBlobs(queue, "input_data")

    all_hidden, last_hidden, _, last_state = recurrent.LSTM(
        model=model,
        input_blob=input_blob,
        seq_lengths=seq_lengths,
        initial_states=(hidden_init, cell_init),
        dim_in=args.input_dim,
        dim_out=args.hidden_dim,
        scope="lstm1",
    )
    model.AddGradientOperators([all_hidden])

    # Carry states over to the next run: the final hidden state seeds
    # hidden_init and the final cell state seeds cell_init. (Previously
    # last_hidden was copied into both, leaving last_state unused.)
    model.net.Copy(last_hidden, hidden_init)
    model.net.Copy(last_state, cell_init)

    # Both initial-state blobs start out as zeros.
    state_shape = [1, args.batch_size, args.hidden_dim]
    workspace.FeedBlob(
        hidden_init, np.zeros(state_shape, dtype=np.float32))
    workspace.FeedBlob(
        cell_init, np.zeros(state_shape, dtype=np.float32))
    return model
def create_lstm(model, input_blob, seq_lengths, init, dim_in, dim_out, scope):
    """Add a ``recurrent.LSTM`` to ``model`` under a fixed scope.

    The ``scope`` argument is accepted but not used: the LSTM is always
    created under ``"external/recurrent"``. ``outputs_with_grads`` is taken
    from the enclosing scope. Nothing is returned.
    """
    positional_args = (
        model, input_blob, seq_lengths, init, dim_in, dim_out,
    )
    recurrent.LSTM(
        *positional_args,
        scope="external/recurrent",
        outputs_with_grads=outputs_with_grads
    )
def create_model(args, queue, label_queue, input_shape):
    """Build the LSTM benchmark model with a softmax loss.

    Args:
        args: options object; ``implementation`` (``"own"`` or ``"cudnn"``),
            ``input_dim``, ``hidden_dim`` and ``batch_size`` are read from
            it, plus ``memory_optimization`` for the ``"own"`` path.
        queue: blob queue from which the ``input_data`` blob is dequeued.
        label_queue: blob queue from which the ``label`` blob is dequeued.
        input_shape: shape of the placeholder input filled in for the
            cuDNN path.

    Returns:
        A ``(model, output)`` tuple: the model helper and the LSTM output
        blob.

    Raises:
        AssertionError: if ``args.implementation`` is not recognised.
    """
    model = cnn.CNNModelHelper(name="LSTM_bench")
    # 'target' is registered as an external input but not used below.
    seq_lengths, hidden_init, cell_init, _target = \
        model.net.AddExternalInputs(
            'seq_lengths',
            'hidden_init',
            'cell_init',
            'target',
        )
    input_blob = model.DequeueBlobs(queue, "input_data")
    labels = model.DequeueBlobs(label_queue, "label")

    if args.implementation == "own":
        output, last_hidden, _, last_state = recurrent.LSTM(
            model=model,
            input_blob=input_blob,
            seq_lengths=seq_lengths,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="lstm1",
            memory_optimization=args.memory_optimization,
        )
    elif args.implementation == "cudnn":
        # We need to feed a placeholder input so that RecurrentInitOp
        # can infer the dimensions.
        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
        # NOTE(review): the third result is taken to be the final cell
        # state -- confirm against recurrent.cudnn_LSTM.
        output, last_hidden, last_state = recurrent.cudnn_LSTM(
            model=model,
            input_blob=input_blob,
            initial_states=(hidden_init, cell_init),
            dim_in=args.input_dim,
            dim_out=args.hidden_dim,
            scope="cudnnlstm",
        )
    else:
        # Raised explicitly (rather than `assert False`) so the check is
        # not stripped when Python runs with -O.
        raise AssertionError("Unknown implementation")

    weights = model.UniformFill(labels, "weights")
    softmax, loss = model.SoftmaxWithLoss(
        [model.Flatten(output), labels, weights],
        ['softmax', 'loss'],
    )
    model.AddGradientOperators([loss])

    # Carry states over to the next run: the final hidden state seeds
    # hidden_init and the final cell state seeds cell_init. (Previously
    # last_hidden was copied into both.)
    model.net.Copy(last_hidden, hidden_init)
    model.net.Copy(last_state, cell_init)

    # Both initial-state blobs start out as zeros.
    state_shape = [1, args.batch_size, args.hidden_dim]
    workspace.FeedBlob(
        hidden_init, np.zeros(state_shape, dtype=np.float32))
    workspace.FeedBlob(
        cell_init, np.zeros(state_shape, dtype=np.float32))
    return model, output
def rnn_unidirectional_encoder(model, embedded_inputs, input_lengths,
                               initial_hidden_state, initial_cell_state,
                               embedding_size, encoder_num_units,
                               use_attention):
    """Unidirectional (forward pass) LSTM encoder.

    Gradients are requested for LSTM output 0 when ``use_attention`` is
    true, and for outputs 1 and 3 otherwise.

    Returns:
        A ``(outputs, final_hidden_state, final_cell_state)`` tuple of
        blobs, i.e. LSTM results 0, 1 and 3.
    """
    if use_attention:
        grad_outputs = [0]
    else:
        grad_outputs = [1, 3]

    lstm_kwargs = dict(
        model=model,
        input_blob=embedded_inputs,
        seq_lengths=input_lengths,
        initial_states=(initial_hidden_state, initial_cell_state),
        dim_in=embedding_size,
        dim_out=encoder_num_units,
        scope='encoder',
        outputs_with_grads=grad_outputs,
    )
    outputs, final_hidden_state, _, final_cell_state = recurrent.LSTM(
        **lstm_kwargs
    )
    return outputs, final_hidden_state, final_cell_state
def rnn_bidirectional_encoder(
    model,
    embedded_inputs,
    input_lengths,
    initial_hidden_state,
    initial_cell_state,
    embedding_size,
    encoder_num_units,
    use_attention
):
    """Bidirectional (forward pass and backward pass) LSTM encoder.

    One LSTM runs over the inputs as given and a second over the inputs
    after ``ReversePackedSegs``. The backward outputs are reversed again,
    and each pair of results is concatenated along axis 2.

    Returns:
        A ``(outputs, final_hidden_state, final_cell_state)`` tuple of the
        concatenated blobs.
    """

    def run_lstm(inputs, scope):
        # Both directions share every setting except input blob and scope.
        return recurrent.LSTM(
            model=model,
            input_blob=inputs,
            seq_lengths=input_lengths,
            initial_states=(initial_hidden_state, initial_cell_state),
            dim_in=embedding_size,
            dim_out=encoder_num_units,
            scope=scope,
            outputs_with_grads=([0] if use_attention else [1, 3]),
        )

    def concat_directions(forward_blob, backward_blob, name):
        # Concat also emits a dims blob, which is dropped here.
        joined, _ = model.net.Concat(
            [forward_blob, backward_blob],
            [name, name + '_dim'],
            axis=2,
        )
        return joined

    # Forward pass
    outputs_fw, hidden_fw, _, cell_fw = run_lstm(
        embedded_inputs, 'forward_encoder')

    # Backward pass: reverse the inputs, encode, then reverse the outputs
    # back.
    reversed_embedded_inputs = model.net.ReversePackedSegs(
        [embedded_inputs, input_lengths],
        ['reversed_embedded_inputs'],
    )
    outputs_bw, hidden_bw, _, cell_bw = run_lstm(
        reversed_embedded_inputs, 'backward_encoder')
    outputs_bw = model.net.ReversePackedSegs(
        [outputs_bw, input_lengths],
        ['outputs_bw'],
    )

    # Concatenate forward and backward results
    outputs = concat_directions(outputs_fw, outputs_bw, 'outputs')
    final_hidden_state = concat_directions(
        hidden_fw, hidden_bw, 'final_hidden_state')
    final_cell_state = concat_directions(
        cell_fw, cell_bw, 'final_cell_state')
    return outputs, final_hidden_state, final_cell_state
def testEqualToCudnn(self):
    """Check that ``recurrent.LSTM`` matches ``recurrent.cudnn_LSTM``.

    Both models are built on the same input blob. The cuDNN model's
    initial parameters are extracted and loaded into the own-LSTM model,
    each net is run three times with a parameter update per run, and the
    outputs, last hidden states and losses are compared with
    ``np.allclose``.
    """
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA)):
        T = 8
        batch_size = 4
        input_dim = 8
        hidden_dim = 31

        def zero_state():
            # Fresh zero array for an initial hidden/cell state blob.
            return np.zeros([1, batch_size, hidden_dim], dtype=np.float32)

        workspace.FeedBlob(
            "seq_lengths", np.array([T] * batch_size, dtype=np.int32))
        workspace.FeedBlob(
            "target",
            np.zeros([T, batch_size, hidden_dim], dtype=np.float32))
        workspace.FeedBlob("hidden_init", zero_state())
        workspace.FeedBlob("cell_init", zero_state())

        own_model = cnn.CNNModelHelper(name="own_lstm")

        input_shape = [T, batch_size, input_dim]
        cudnn_model = cnn.CNNModelHelper(name="cudnn_lstm")
        input_blob = cudnn_model.param_init_net.UniformFill(
            [], "input", shape=input_shape)
        workspace.FeedBlob("CUDNN/hidden_init_cudnn", zero_state())
        workspace.FeedBlob("CUDNN/cell_init_cudnn", zero_state())

        # The cell state uses its own blob, matching the
        # "CUDNN/cell_init_cudnn" feed above. (Previously the hidden-state
        # blob was passed for both initial states.)
        cudnn_output, cudnn_last_hidden, _, param_extract = \
            recurrent.cudnn_LSTM(
                model=cudnn_model,
                input_blob=input_blob,
                initial_states=("hidden_init_cudnn", "cell_init_cudnn"),
                dim_in=input_dim,
                dim_out=hidden_dim,
                scope="CUDNN",
                return_params=True,
            )
        cudnn_loss = cudnn_model.AveragedLoss(
            cudnn_model.SquaredL2Distance(
                [cudnn_output, "target"], "CUDNN/dist"),
            "CUDNN/loss")

        own_output, own_last_hidden, _, last_state, own_params = \
            recurrent.LSTM(
                model=own_model,
                input_blob=input_blob,
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=input_dim,
                dim_out=hidden_dim,
                scope="OWN",
                return_params=True,
            )
        own_loss = own_model.AveragedLoss(
            own_model.SquaredL2Distance([own_output, "target"], "OWN/dist"),
            "OWN/loss")

        # Add gradients
        cudnn_model.AddGradientOperators([cudnn_loss])
        own_model.AddGradientOperators([own_loss])

        # Add parameter updates. LR and ONE are created by the cuDNN
        # model's init net and used by the update ops of both models.
        LR = cudnn_model.param_init_net.ConstantFill(
            [], shape=[1], value=0.01)
        ONE = cudnn_model.param_init_net.ConstantFill(
            [], shape=[1], value=1.0)
        for param in cudnn_model.GetParams():
            cudnn_model.WeightedSum(
                [param, ONE, cudnn_model.param_to_grad[param], LR], param)
        for param in own_model.GetParams():
            own_model.WeightedSum(
                [param, ONE, own_model.param_to_grad[param], LR], param)

        workspace.RunNetOnce(cudnn_model.param_init_net)
        workspace.CreateNet(cudnn_model.net)

        ##
        ## CUDNN LSTM MODEL EXECUTION
        ##
        # Get initial values from CuDNN LSTM so we can feed them
        # to our own.
        (param_extract_net, param_extract_mapping) = param_extract
        workspace.RunNetOnce(param_extract_net)
        cudnn_lstm_params = {
            input_type: {
                key: workspace.FetchBlob(blobs[0])
                for key, blobs in pars.items()
            }
            for input_type, pars in param_extract_mapping.items()
        }

        # Run the model 3 times, so that some parameter updates are done
        workspace.RunNet(cudnn_model.net.Proto().name, 3)

        ##
        ## OWN LSTM MODEL EXECUTION
        ##
        # Map the cuDNN parameters to our own
        workspace.RunNetOnce(own_model.param_init_net)
        recurrent.InitFromLSTMParams(own_params, cudnn_lstm_params)

        # Run the model 3 times, so that some parameter updates are done
        workspace.CreateNet(own_model.net)
        workspace.RunNet(own_model.net.Proto().name, 3)

        ##
        ## COMPARE RESULTS
        ##
        # Then compare that final results after 3 runs are equal
        own_output_data = workspace.FetchBlob(own_output)
        own_last_hidden_data = workspace.FetchBlob(own_last_hidden)
        own_loss_data = workspace.FetchBlob(own_loss)
        cudnn_output_data = workspace.FetchBlob(cudnn_output)
        cudnn_last_hidden_data = workspace.FetchBlob(cudnn_last_hidden)
        cudnn_loss_data = workspace.FetchBlob(cudnn_loss)

        self.assertTrue(np.allclose(own_output_data, cudnn_output_data))
        self.assertTrue(
            np.allclose(own_last_hidden_data, cudnn_last_hidden_data))
        self.assertTrue(np.allclose(own_loss_data, cudnn_loss_data))
def _build_model(
    self,
    init_params,
):
    """Build the encoder-decoder training model and store it on ``self``.

    Creates the external inputs, the encoder (via
    ``self._embedding_encoder``), the LSTM decoder (with or without
    attention, chosen by ``self.model_params['attention']``), the
    softmax loss, the gradient operators and the per-parameter update
    ops.

    Args:
        init_params: forwarded to ``seq2seq_util.ModelHelper``.

    Side effects:
        Sets ``self.encoder_inputs``, ``self.encoder_lengths``,
        ``self.decoder_inputs``, ``self.decoder_lengths``,
        ``self.targets``, ``self.target_weights``, ``self.learning_rate``,
        ``self.global_step``, ``self.start_time``,
        ``self.total_loss_scalar``, ``self.forward_net`` and
        ``self.model``.
    """
    model = seq2seq_util.ModelHelper(init_params=init_params, )
    self.encoder_inputs = model.net.AddExternalInput('encoder_inputs')
    self.encoder_lengths = model.net.AddExternalInput('encoder_lengths')
    self.decoder_inputs = model.net.AddExternalInput('decoder_inputs')
    self.decoder_lengths = model.net.AddExternalInput('decoder_lengths')
    self.targets = model.net.AddExternalInput('targets')
    self.target_weights = model.net.AddExternalInput('target_weights')

    optimizer_params = self.model_params['optimizer_params']
    attention_type = self.model_params['attention']
    assert attention_type in ['none', 'regular']

    # Non-trainable bookkeeping params.
    self.learning_rate = model.AddParam(
        name='learning_rate',
        init_value=float(optimizer_params['learning_rate']),
        trainable=False,
    )
    self.global_step = model.AddParam(
        name='global_step',
        init_value=0,
        trainable=False,
    )
    self.start_time = model.AddParam(
        name='start_time',
        init_value=time.time(),
        trainable=False,
    )

    # This builder handles at most one GPU and exactly one encoder and
    # one decoder layer.
    assert self.num_gpus < 2
    assert len(self.encoder_params['encoder_layer_configs']) == 1
    assert len(self.model_params['decoder_layer_configs']) == 1

    encoder_num_units = (
        self.encoder_params['encoder_layer_configs'][0]['num_units'])
    decoder_num_units = (
        self.model_params['decoder_layer_configs'][0]['num_units'])

    (
        encoder_outputs,
        final_encoder_hidden_state,
        final_encoder_cell_state,
    ) = self._embedding_encoder(
        model=model,
        encoder_type=self.encoder_type,
        encoder_params=self.encoder_params,
        inputs=self.encoder_inputs,
        input_lengths=self.encoder_lengths,
        vocab_size=self.source_vocab_size,
        embedding_size=self.model_params['encoder_embedding_size'],
        use_attention=(attention_type != 'none'),
    )

    # For bidirectional RNN, the num of units doubles after encoding
    if (self.encoder_type == 'rnn' and
            self.encoder_params['use_bidirectional_encoder']):
        encoder_num_units *= 2

    if attention_type == 'none':
        # Without attention, the decoder's initial states are projected
        # from the encoder's final states.
        decoder_initial_hidden_state = model.FC(
            final_encoder_hidden_state,
            'decoder_initial_hidden_state',
            encoder_num_units,
            decoder_num_units,
            axis=2,
        )
        decoder_initial_cell_state = model.FC(
            final_encoder_cell_state,
            'decoder_initial_cell_state',
            encoder_num_units,
            decoder_num_units,
            axis=2,
        )
    else:
        # With attention, the decoder starts from zero-filled states and
        # a zero-filled attention context.
        decoder_initial_hidden_state = model.param_init_net.ConstantFill(
            [],
            'decoder_initial_hidden_state',
            shape=[decoder_num_units],
            value=0.0,
        )
        decoder_initial_cell_state = model.param_init_net.ConstantFill(
            [],
            'decoder_initial_cell_state',
            shape=[decoder_num_units],
            value=0.0,
        )
        initial_attention_weighted_encoder_context = (
            model.param_init_net.ConstantFill(
                [],
                'initial_attention_weighted_encoder_context',
                shape=[encoder_num_units],
                value=0.0,
            ))

    sqrt3 = math.sqrt(3)
    decoder_embeddings = model.AddParam(
        name='decoder_embeddings',
        init=('UniformFill', dict(
            shape=[
                self.target_vocab_size,
                self.model_params['decoder_embedding_size'],
            ],
            min=-sqrt3,
            max=sqrt3,
        )),
    )

    embedded_decoder_inputs = model.net.Gather(
        [decoder_embeddings, self.decoder_inputs],
        ['embedded_decoder_inputs'],
    )
    # seq_len x batch_size x decoder_embedding_size

    # NOTE(review): only the decoder construction is placed under the
    # reset name scope -- confirm this is the intended extent.
    with core.NameScope('', reset=True):
        if attention_type == 'none':
            decoder_outputs, _, _, _ = recurrent.LSTM(
                model=model,
                input_blob=embedded_decoder_inputs,
                seq_lengths=self.decoder_lengths,
                initial_states=(
                    decoder_initial_hidden_state,
                    decoder_initial_cell_state,
                ),
                dim_in=self.model_params['decoder_embedding_size'],
                dim_out=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0],
            )
            decoder_output_size = decoder_num_units
        else:
            (
                decoder_outputs, _, _, _,
                attention_weighted_encoder_contexts, _
            ) = recurrent.LSTMWithAttention(
                model=model,
                decoder_inputs=embedded_decoder_inputs,
                decoder_input_lengths=self.decoder_lengths,
                initial_decoder_hidden_state=decoder_initial_hidden_state,
                initial_decoder_cell_state=decoder_initial_cell_state,
                initial_attention_weighted_encoder_context=(
                    initial_attention_weighted_encoder_context),
                encoder_output_dim=encoder_num_units,
                encoder_outputs=encoder_outputs,
                decoder_input_dim=self.
                model_params['decoder_embedding_size'],
                decoder_state_dim=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0, 4],
            )
            # The softmax input is the decoder state concatenated with
            # the attention context.
            decoder_outputs, _ = model.net.Concat(
                [decoder_outputs, attention_weighted_encoder_contexts],
                [
                    'states_and_context_combination',
                    '_states_and_context_combination_concat_dims',
                ],
                axis=2,
            )
            decoder_output_size = decoder_num_units + encoder_num_units

    # we do softmax over the whole sequence
    # (max_length in the batch * batch_size) x decoder embedding size
    # -1 because we don't know max_length yet
    decoder_outputs_flattened, _ = model.net.Reshape(
        [decoder_outputs],
        [
            'decoder_outputs_flattened',
            'decoder_outputs_and_contexts_combination_old_shape',
        ],
        shape=[-1, decoder_output_size],
    )
    output_logits = self.output_projection(
        model=model,
        decoder_outputs=decoder_outputs_flattened,
        decoder_output_size=decoder_output_size,
        target_vocab_size=self.target_vocab_size,
        decoder_softmax_size=self.model_params['decoder_softmax_size'],
    )
    targets, _ = model.net.Reshape(
        [self.targets],
        ['targets', 'targets_old_shape'],
        shape=[-1],
    )
    target_weights, _ = model.net.Reshape(
        [self.target_weights],
        ['target_weights', 'target_weights_old_shape'],
        shape=[-1],
    )

    output_probs, loss_per_word = model.net.SoftmaxWithLoss(
        [output_logits, targets, target_weights],
        ['OutputProbs', 'loss_per_word'],
    )

    num_words = model.net.ReduceFrontSum(
        target_weights,
        'num_words',
    )
    self.total_loss_scalar = model.net.Mul(
        [loss_per_word, num_words],
        'total_loss_scalar',
    )

    # The forward-only net is cloned here, before any gradient or update
    # operators are added to model.net.
    self.forward_net = model.net.Clone(
        name=model.net.Name() + '_forward_only', )
    # print loss only in the forward net which evaluates loss after every
    # epoch
    self.forward_net.Print([self.total_loss_scalar], [])

    # Note: average over batch.
    # It is tricky because of two problems:
    # 1. ReduceFrontSum from 1-D tensor returns 0-D tensor
    # 2. If you want to multiply 0-D by 1-D tensor
    # (by scalar batch_size_inverse_tensor),
    # you need to use broadcasting. But gradient propagation
    # is broken for op with broadcasting.
    # total_loss_scalar, _ = model.net.Reshape(
    #     [total_loss_scalar],
    #     [total_loss_scalar, 'total_loss_scalar_old_shape'],
    #     shape=[1],
    # )
    batch_size_inverse_tensor = (model.param_init_net.ConstantFill(
        [],
        'batch_size_tensor',
        shape=[],
        value=1.0 / self.batch_size,
    ))
    total_loss_scalar_average = model.net.Mul(
        [self.total_loss_scalar, batch_size_inverse_tensor],
        ['total_loss_scalar_average'],
    )

    model.AddGradientOperators([
        total_loss_scalar_average,
    ])
    ONE = model.param_init_net.ConstantFill(
        [],
        'ONE',
        shape=[1],
        value=1.0,
    )
    logger.info('All trainable variables: ')

    for param in model.params:
        # Params without a gradient get no update op. The membership test
        # must come before the lookup, which would otherwise raise
        # KeyError for such params.
        if param not in model.param_to_grad:
            continue
        param_grad = model.param_to_grad[param]

        # NOTE(review): both branches clip gradients with min=0.0, which
        # would discard negative components -- confirm this is intended.
        if isinstance(param_grad, core.GradientSlice):
            # Sparse gradient: clip the values and scatter the update
            # into the rows given by the indices.
            param_grad_values = param_grad.values
            param_grad_values = model.net.Clip(
                [param_grad_values],
                [param_grad_values],
                min=0.0,
                max=float(self.model_params['max_grad_value']),
            )
            model.net.ScatterWeightedSum(
                [
                    param,
                    ONE,
                    param_grad.indices,
                    param_grad_values,
                    model.net.Negative(
                        [self.learning_rate],
                        'negative_learning_rate',
                    ),
                ],
                param,
            )
        else:
            # Dense gradient: param = 1 * param + (-lr) * clipped grad.
            param_grad = model.net.Clip(
                [param_grad],
                [param_grad],
                min=0.0,
                max=float(self.model_params['max_grad_value']),
            )
            model.net.WeightedSum(
                [
                    param,
                    ONE,
                    param_grad,
                    model.net.Negative(
                        [self.learning_rate],
                        'negative_learning_rate',
                    ),
                ],
                param,
            )
    self.model = model
def test_lstm(self, t, n, d):
    """Check the last op that ``recurrent.LSTM`` adds to the net.

    Builds an LSTM under the ``external/recurrent`` scope, feeds random
    inputs, and compares the net's final operator with a NumPy reference
    implementation. Gradients are then checked for inputs 0, 3 and 4.

    Args:
        t: number of time steps in the random input.
        n: batch size of the random input.
        d: hidden dimension; the fed input has ``4 * d`` features.
    """
    model = ModelHelperBase(name='external')
    external_blobs = model.net.AddExternalInputs(
        'input_blob', 'seq_lengths', 'hidden_init', 'cell_init')
    input_blob, seq_lengths, hidden_init, cell_init = external_blobs

    recurrent.LSTM(
        model,
        input_blob,
        seq_lengths,
        (hidden_init, cell_init),
        d,
        d,
        scope="external/recurrent",
    )
    # The operator under test is the last one appended to the net.
    op = model.net._net.op[-1]

    def extract_param_name(model, param_substr):
        # Exactly one model param name may contain the substring.
        matches = [
            str(p) for p in model.params if param_substr in str(p)
        ]
        assert len(matches) == 1
        return matches[0]

    gates = {}
    for gate in ["gates_t_b", "gates_t_w"]:
        gates[gate] = extract_param_name(model, gate)

    workspace.RunNetOnce(model.param_init_net)

    def reference(input, hidden_input, cell_input,
                  gates_w, gates_b, seq_lengths):
        # NumPy reference; row 0 of each buffer holds the initial state.
        T, N, G = input.shape[0], input.shape[1], input.shape[2]
        D = hidden_input.shape[2]
        hidden = np.zeros(shape=(T + 1, N, D))
        cell = np.zeros(shape=(T + 1, N, D))
        assert hidden.shape[0] == T + 1
        assert cell.shape[0] == T + 1
        assert hidden.shape[1] == N
        assert cell.shape[1] == N
        cell[0, :, :] = cell_input
        hidden[0, :, :] = hidden_input
        for step in range(T):
            timestep = np.asarray([step]).astype(np.int32)
            step_input = input[step].reshape(1, N, G)
            prev_hidden = hidden[step].reshape(1, N, D)
            prev_cell = cell[step].reshape(1, N, D)
            # Recurrent contribution plus bias, then the step's input.
            gate_values = np.dot(prev_hidden, gates_w.T) + gates_b
            gate_values = gate_values + step_input
            next_hidden, next_cell = lstm_unit(
                prev_cell, gate_values, seq_lengths, timestep)
            hidden[step + 1] = next_hidden
            cell[step + 1] = next_cell
        return (
            hidden[1:],
            hidden[-1].reshape(1, N, D),
            cell[1:],
            cell[-1].reshape(1, N, D),
        )

    # From here on, the input blob is the op's first input.
    input_blob = op.input[0]
    random_feeds = (
        (str(input_blob),
         lambda: np.random.randn(t, n, d * 4).astype(np.float32)),
        ("hidden_init",
         lambda: np.random.randn(1, n, d).astype(np.float32)),
        ("cell_init",
         lambda: np.random.randn(1, n, d).astype(np.float32)),
        ("seq_lengths",
         lambda: np.random.randint(0, t, size=(n, )).astype(np.int32)),
    )
    # Values are drawn one at a time, in this order, right before feeding.
    for blob_name, draw in random_feeds:
        workspace.FeedBlob(blob_name, draw())

    op_input_names = [
        input_blob,
        "hidden_init",
        "cell_init",
        gates["gates_t_w"],
        gates["gates_t_b"],
        "seq_lengths",
    ]

    def fetch_op_inputs():
        # Re-fetch from the workspace for every check.
        return [workspace.FetchBlob(name) for name in op_input_names]

    self.assertReferenceChecks(
        hu.cpu_do,
        op,
        fetch_op_inputs(),
        reference,
    )

    # Checking for input, gates_t_w and gates_t_b gradients
    for param in [0, 3, 4]:
        self.assertGradientChecks(
            hu.cpu_do,
            op,
            fetch_op_inputs(),
            param,
            [0],
            threshold=0.01,
        )
def model_build_fun(self, model, forward_only=False, loss_scale=None):
    """Add the encoder, decoder and loss operators to ``model``.

    Args:
        model: model helper that receives the operators.
        forward_only: forwarded to ``self._build_embedding_encoder``.
        loss_scale: accepted but not used in this function.

    Returns:
        A one-element list holding the total loss blob scaled by
        ``1.0 / self.batch_size``.
    """

    def add_scoped_input(name):
        # External inputs are named under the current workspace scope.
        return model.net.AddExternalInput(workspace.GetNameScope() + name)

    encoder_inputs = add_scoped_input('encoder_inputs')
    encoder_lengths = add_scoped_input('encoder_lengths')
    decoder_inputs = add_scoped_input('decoder_inputs')
    decoder_lengths = add_scoped_input('decoder_lengths')
    targets = add_scoped_input('targets')
    target_weights = add_scoped_input('target_weights')

    attention_type = self.model_params['attention']
    assert attention_type in ['none', 'regular']
    without_attention = attention_type == 'none'

    encoder_results = self._build_embedding_encoder(
        model=model,
        inputs=encoder_inputs,
        input_lengths=encoder_lengths,
        vocab_size=self.source_vocab_size,
        embeddings=self.encoder_embeddings,
        embedding_size=self.model_params['encoder_embedding_size'],
        use_attention=(attention_type != 'none'),
        num_gpus=self.num_gpus,
        forward_only=forward_only,
    )
    (
        encoder_outputs,
        weighted_encoder_outputs,
        final_encoder_hidden_state,
        final_encoder_cell_state,
        encoder_output_dim,
    ) = encoder_results

    # Exactly one decoder layer is supported.
    assert len(self.model_params['decoder_layer_configs']) == 1
    decoder_num_units = (
        self.model_params['decoder_layer_configs'][0]['num_units'])

    if without_attention:
        # Project the encoder's final states into the decoder's size.
        decoder_initial_hidden_state = model.FC(
            final_encoder_hidden_state,
            'decoder_initial_hidden_state',
            encoder_output_dim,
            decoder_num_units,
            axis=2,
        )
        decoder_initial_cell_state = model.FC(
            final_encoder_cell_state,
            'decoder_initial_cell_state',
            encoder_output_dim,
            decoder_num_units,
            axis=2,
        )
    else:
        # Attention path: zero-filled initial states and context.
        zero_fill = model.param_init_net.ConstantFill
        decoder_initial_hidden_state = zero_fill(
            [],
            'decoder_initial_hidden_state',
            shape=[decoder_num_units],
            value=0.0,
        )
        decoder_initial_cell_state = zero_fill(
            [],
            'decoder_initial_cell_state',
            shape=[decoder_num_units],
            value=0.0,
        )
        initial_attention_weighted_encoder_context = zero_fill(
            [],
            'initial_attention_weighted_encoder_context',
            shape=[encoder_output_dim],
            value=0.0,
        )

    if self.num_gpus == 0:
        embedded_decoder_inputs = model.net.Gather(
            [self.decoder_embeddings, decoder_inputs],
            ['embedded_decoder_inputs'],
        )
    else:
        # Gather under a CPU device scope, then copy the result to GPU.
        cpu_option = core.DeviceOption(caffe2_pb2.CPU)
        with core.DeviceScope(cpu_option):
            embedded_decoder_inputs_cpu = model.net.Gather(
                [self.decoder_embeddings, decoder_inputs],
                ['embedded_decoder_inputs_cpu'],
            )
        embedded_decoder_inputs = model.CopyCPUToGPU(
            embedded_decoder_inputs_cpu,
            'embedded_decoder_inputs',
        )
    # seq_len x batch_size x decoder_embedding_size

    decoder_embedding_size = self.model_params['decoder_embedding_size']
    if without_attention:
        decoder_outputs, _, _, _ = recurrent.LSTM(
            model=model,
            input_blob=embedded_decoder_inputs,
            seq_lengths=decoder_lengths,
            initial_states=(
                decoder_initial_hidden_state,
                decoder_initial_cell_state,
            ),
            dim_in=decoder_embedding_size,
            dim_out=decoder_num_units,
            scope='decoder',
            outputs_with_grads=[0],
        )
        decoder_output_size = decoder_num_units
    else:
        attention_results = recurrent.LSTMWithAttention(
            model=model,
            decoder_inputs=embedded_decoder_inputs,
            decoder_input_lengths=decoder_lengths,
            initial_decoder_hidden_state=decoder_initial_hidden_state,
            initial_decoder_cell_state=decoder_initial_cell_state,
            initial_attention_weighted_encoder_context=(
                initial_attention_weighted_encoder_context),
            encoder_output_dim=encoder_output_dim,
            encoder_outputs=encoder_outputs,
            decoder_input_dim=decoder_embedding_size,
            decoder_state_dim=decoder_num_units,
            scope='decoder',
            outputs_with_grads=[0, 4],
        )
        (
            decoder_outputs, _, _, _,
            attention_weighted_encoder_contexts, _
        ) = attention_results
        # The softmax input is the decoder state concatenated with the
        # attention context.
        decoder_outputs, _ = model.net.Concat(
            [decoder_outputs, attention_weighted_encoder_contexts],
            [
                'states_and_context_combination',
                '_states_and_context_combination_concat_dims',
            ],
            axis=2,
        )
        decoder_output_size = decoder_num_units + encoder_output_dim

    # we do softmax over the whole sequence
    # (max_length in the batch * batch_size) x decoder embedding size
    # -1 because we don't know max_length yet
    decoder_outputs_flattened, _ = model.net.Reshape(
        [decoder_outputs],
        [
            'decoder_outputs_flattened',
            'decoder_outputs_and_contexts_combination_old_shape',
        ],
        shape=[-1, decoder_output_size],
    )
    output_logits = self.output_projection(
        model=model,
        decoder_outputs=decoder_outputs_flattened,
        decoder_output_size=decoder_output_size,
        target_vocab_size=self.target_vocab_size,
        decoder_softmax_size=self.model_params['decoder_softmax_size'],
    )

    # Flatten targets and weights to 1-D.
    flat_targets, _ = model.net.Reshape(
        [targets],
        ['targets', 'targets_old_shape'],
        shape=[-1],
    )
    flat_target_weights, _ = model.net.Reshape(
        [target_weights],
        ['target_weights', 'target_weights_old_shape'],
        shape=[-1],
    )

    softmax_engine = 'CUDNN' if self.num_gpus > 0 else None
    output_probs = model.net.Softmax(
        [output_logits],
        ['output_probs'],
        engine=softmax_engine,
    )
    label_cross_entropy = model.net.LabelCrossEntropy(
        [output_probs, flat_targets],
        ['label_cross_entropy'],
    )
    weighted_label_cross_entropy = model.net.Mul(
        [label_cross_entropy, flat_target_weights],
        'weighted_label_cross_entropy',
    )
    total_loss_scalar = model.net.SumElements(
        [weighted_label_cross_entropy],
        'total_loss_scalar',
    )
    # Scale the summed loss by 1 / batch_size.
    total_loss_scalar_weighted = model.net.Scale(
        [total_loss_scalar],
        'total_loss_scalar_weighted',
        scale=1.0 / self.batch_size,
    )
    return [total_loss_scalar_weighted]