Example #1
    def _build_model(
        self,
        init_params,
    ):
        model = seq2seq_util.ModelHelper(init_params=init_params)

        self.encoder_inputs = model.net.AddExternalInput('encoder_inputs')
        self.encoder_lengths = model.net.AddExternalInput('encoder_lengths')
        self.decoder_inputs = model.net.AddExternalInput('decoder_inputs')
        self.decoder_lengths = model.net.AddExternalInput('decoder_lengths')
        self.targets = model.net.AddExternalInput('targets')
        self.target_weights = model.net.AddExternalInput('target_weights')

        optimizer_params = self.model_params['optimizer_params']
        attention_type = self.model_params['attention']
        assert attention_type in ['none', 'regular']

        self.learning_rate = model.AddParam(
            name='learning_rate',
            init_value=float(optimizer_params['learning_rate']),
            trainable=False,
        )
        self.global_step = model.AddParam(
            name='global_step',
            init_value=0,
            trainable=False,
        )
        self.start_time = model.AddParam(
            name='start_time',
            init_value=time.time(),
            trainable=False,
        )

        assert self.num_gpus < 2
        assert len(self.encoder_params['encoder_layer_configs']) == 1
        assert len(self.model_params['decoder_layer_configs']) == 1

        encoder_num_units = (
            self.encoder_params['encoder_layer_configs'][0]['num_units'])
        decoder_num_units = (
            self.model_params['decoder_layer_configs'][0]['num_units'])

        (
            encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
        ) = self._embedding_encoder(
            model=model,
            encoder_type=self.encoder_type,
            encoder_params=self.encoder_params,
            inputs=self.encoder_inputs,
            input_lengths=self.encoder_lengths,
            vocab_size=self.source_vocab_size,
            embedding_size=self.model_params['encoder_embedding_size'],
            use_attention=(attention_type != 'none'),
        )
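        # encoder_outputs: seq_len x batch_size x encoder output dim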

        # For a bidirectional RNN, the number of units doubles after encoding
        if (self.encoder_type == 'rnn'
                and self.encoder_params['use_bidirectional_encoder']):
            encoder_num_units *= 2

        if attention_type == 'none':
            decoder_initial_hidden_state = model.FC(
                final_encoder_hidden_state,
                'decoder_initial_hidden_state',
                encoder_num_units,
                decoder_num_units,
                axis=2,
            )
            decoder_initial_cell_state = model.FC(
                final_encoder_cell_state,
                'decoder_initial_cell_state',
                encoder_num_units,
                decoder_num_units,
                axis=2,
            )
        else:
            decoder_initial_hidden_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_hidden_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            decoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_cell_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            initial_attention_weighted_encoder_context = (
                model.param_init_net.ConstantFill(
                    [],
                    'initial_attention_weighted_encoder_context',
                    shape=[encoder_num_units],
                    value=0.0,
                ))

        sqrt3 = math.sqrt(3)
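        # Uniform on [-sqrt(3), sqrt(3)] has unit variance, a common choice
        # for embedding initialization.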
        decoder_embeddings = model.AddParam(
            name='decoder_embeddings',
            init=('UniformFill',
                  dict(
                      shape=[
                          self.target_vocab_size,
                          self.model_params['decoder_embedding_size'],
                      ],
                      min=-sqrt3,
                      max=sqrt3,
                  )),
        )

        embedded_decoder_inputs = model.net.Gather(
            [decoder_embeddings, self.decoder_inputs],
            ['embedded_decoder_inputs'],
        )
        # seq_len x batch_size x decoder_embedding_size
        with core.NameScope('', reset=True):
            if attention_type == 'none':
                decoder_outputs, _, _, _ = recurrent.LSTM(
                    model=model,
                    input_blob=embedded_decoder_inputs,
                    seq_lengths=self.decoder_lengths,
                    initial_states=(
                        decoder_initial_hidden_state,
                        decoder_initial_cell_state,
                    ),
                    dim_in=self.model_params['decoder_embedding_size'],
                    dim_out=decoder_num_units,
                    scope='decoder',
                    outputs_with_grads=[0],
                )
                decoder_output_size = decoder_num_units
            else:
                (decoder_outputs, _, _, _, attention_weighted_encoder_contexts,
                 _) = recurrent.LSTMWithAttention(
                     model=model,
                     decoder_inputs=embedded_decoder_inputs,
                     decoder_input_lengths=self.decoder_lengths,
                     initial_decoder_hidden_state=decoder_initial_hidden_state,
                     initial_decoder_cell_state=decoder_initial_cell_state,
                     initial_attention_weighted_encoder_context=(
                         initial_attention_weighted_encoder_context),
                     encoder_output_dim=encoder_num_units,
                     encoder_outputs=encoder_outputs,
                     decoder_input_dim=(
                         self.model_params['decoder_embedding_size']),
                     decoder_state_dim=decoder_num_units,
                     scope='decoder',
                     outputs_with_grads=[0, 4],
                 )
                decoder_outputs, _ = model.net.Concat(
                    [decoder_outputs, attention_weighted_encoder_contexts],
                    [
                        'states_and_context_combination',
                        '_states_and_context_combination_concat_dims',
                    ],
                    axis=2,
                )
                decoder_output_size = decoder_num_units + encoder_num_units

        # we do softmax over the whole sequence:
        # (max_length in the batch * batch_size) x decoder_output_size
        # -1 because we don't know max_length yet
        decoder_outputs_flattened, _ = model.net.Reshape(
            [decoder_outputs],
            [
                'decoder_outputs_flattened',
                'decoder_outputs_and_contexts_combination_old_shape',
            ],
            shape=[-1, decoder_output_size],
        )
        output_logits = self.output_projection(
            model=model,
            decoder_outputs=decoder_outputs_flattened,
            decoder_output_size=decoder_output_size,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=self.model_params['decoder_softmax_size'],
        )
        targets, _ = model.net.Reshape(
            [self.targets],
            ['targets', 'targets_old_shape'],
            shape=[-1],
        )
        target_weights, _ = model.net.Reshape(
            [self.target_weights],
            ['target_weights', 'target_weights_old_shape'],
            shape=[-1],
        )

        output_probs, loss_per_word = model.net.SoftmaxWithLoss(
            [output_logits, targets, target_weights],
            ['OutputProbs', 'loss_per_word'],
        )

        num_words = model.net.ReduceFrontSum(
            target_weights,
            'num_words',
        )
        self.total_loss_scalar = model.net.Mul(
            [loss_per_word, num_words],
            'total_loss_scalar',
        )
        self.forward_net = model.net.Clone(
            name=model.net.Name() + '_forward_only',
        )
        # Print the loss only in the forward net, which evaluates it after
        # every epoch.
        self.forward_net.Print([self.total_loss_scalar], [])

        # Note: average over batch.
        # It is tricky because of two problems:
        # 1. ReduceFrontSum of a 1-D tensor returns a 0-D tensor
        # 2. If you want to multiply a 0-D tensor by a 1-D tensor
        # (by the scalar batch_size_inverse_tensor),
        # you need to use broadcasting. But gradient propagation
        # is broken for ops with broadcasting.
        # total_loss_scalar, _ = model.net.Reshape(
        #     [total_loss_scalar],
        #     [total_loss_scalar, 'total_loss_scalar_old_shape'],
        #     shape=[1],
        # )
        batch_size_inverse_tensor = model.param_init_net.ConstantFill(
            [],
            'batch_size_tensor',
            shape=[],
            value=1.0 / self.batch_size,
        )
        total_loss_scalar_average = model.net.Mul(
            [self.total_loss_scalar, batch_size_inverse_tensor],
            ['total_loss_scalar_average'],
        )
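        # Worked example with illustrative numbers: for a batch of 32
        # sequences totalling 640 target tokens, loss_per_word might be 2.5
        # (a 0-D tensor) and num_words is 640.0, so total_loss_scalar is
        # 1600.0 and the average above is 1600.0 / 32 = 50.0 per sentence.
        # Both Mul calls combine 0-D tensors, so no broadcasting is needed
        # and gradients flow through cleanly.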

        model.AddGradientOperators([
            total_loss_scalar_average,
        ])
        ONE = model.param_init_net.ConstantFill(
            [],
            'ONE',
            shape=[1],
            value=1.0,
        )
        logger.info('All trainable variables:')

        for param in model.params:
            if param in model.param_to_grad:
                param_grad = model.param_to_grad[param]
                if isinstance(param_grad, core.GradientSlice):
                    param_grad_values = param_grad.values
                    # Clip gradient values symmetrically into
                    # [-max_grad_value, max_grad_value].
                    param_grad_values = model.net.Clip(
                        [param_grad_values],
                        [param_grad_values],
                        min=-float(self.model_params['max_grad_value']),
                        max=float(self.model_params['max_grad_value']),
                    )
                    model.net.ScatterWeightedSum(
                        [
                            param,
                            ONE,
                            param_grad.indices,
                            param_grad_values,
                            model.net.Negative(
                                [self.learning_rate],
                                'negative_learning_rate',
                            ),
                        ],
                        param,
                    )
                else:
                    # Clip gradient values symmetrically into
                    # [-max_grad_value, max_grad_value].
                    param_grad = model.net.Clip(
                        [param_grad],
                        [param_grad],
                        min=-float(self.model_params['max_grad_value']),
                        max=float(self.model_params['max_grad_value']),
                    )
                    model.net.WeightedSum(
                        [
                            param,
                            ONE,
                            param_grad,
                            model.net.Negative(
                                [self.learning_rate],
                                'negative_learning_rate',
                            ),
                        ],
                        param,
                    )
        self.model = model
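A minimal sketch of how a model built this way might be driven, assuming
`trainer` is an instance of the (hypothetical) class that defines
`_build_model` above, with its model_params, encoder_params, and vocabulary
sizes already set; blob shapes and the vocabulary range are illustrative:

    import numpy as np
    from caffe2.python import workspace

    trainer._build_model(init_params=True)

    # Run the init net once to materialize parameters, then build the
    # training net.
    workspace.RunNetOnce(trainer.model.param_init_net)
    workspace.CreateNet(trainer.model.net)

    # Feed one batch of token ids (seq_len x batch_size) plus lengths.
    seq_len, batch_size = 20, 32
    workspace.FeedBlob(
        'encoder_inputs',
        np.random.randint(0, 100, (seq_len, batch_size)).astype(np.int32))
    workspace.FeedBlob(
        'encoder_lengths', np.full((batch_size,), seq_len, dtype=np.int32))
    # ... feed decoder_inputs, decoder_lengths, targets, and target_weights
    # the same way, then run one update step:
    workspace.RunNet(trainer.model.net.Name())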
Example #2
    def test_lstm_with_recurrent_attention(
        self,
        encoder_output_length,
        encoder_output_dim,
        decoder_input_length,
        decoder_state_dim,
        batch_size,
        gc,
        dc,
    ):
        with core.DeviceScope(gc):
            model = CNNModelHelper(name="external")
            (
                encoder_outputs,
                decoder_inputs,
                decoder_input_lengths,
                initial_decoder_hidden_state,
                initial_decoder_cell_state,
                initial_attention_weighted_encoder_context,
            ) = model.net.AddExternalInputs(
                "encoder_outputs",
                "decoder_inputs",
                "decoder_input_lengths",
                "initial_decoder_hidden_state",
                "initial_decoder_cell_state",
                "initial_attention_weighted_encoder_context",
            )
            recurrent.LSTMWithAttention(
                model=model,
                decoder_inputs=decoder_inputs,
                decoder_input_lengths=decoder_input_lengths,
                initial_decoder_hidden_state=initial_decoder_hidden_state,
                initial_decoder_cell_state=initial_decoder_cell_state,
                initial_attention_weighted_encoder_context=(
                    initial_attention_weighted_encoder_context),
                encoder_output_dim=encoder_output_dim,
                encoder_outputs=encoder_outputs,
                decoder_input_dim=decoder_state_dim,
                decoder_state_dim=decoder_state_dim,
                scope='external/LSTMWithAttention',
                attention_type=AttentionType.Recurrent)
            op = model.net._net.op[-1]
        workspace.RunNetOnce(model.param_init_net)

        # This is the original decoder_inputs blob after the linear layer
        decoder_input_blob = op.input[0]
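        # decoder_state_dim * 4 below: the input projection packs the four
        # LSTM gate pre-activations (input, forget, output, cell) into one
        # blob along the last axis.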

        workspace.FeedBlob(
            decoder_input_blob,
            np.random.randn(
                decoder_input_length,
                batch_size,
                decoder_state_dim * 4,
            ).astype(np.float32))
        workspace.FeedBlob(
            "external/LSTMWithAttention/encoder_outputs_transposed",
            np.random.randn(
                batch_size,
                encoder_output_dim,
                encoder_output_length,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            "external/LSTMWithAttention/weighted_encoder_outputs",
            np.random.randn(
                encoder_output_length,
                batch_size,
                encoder_output_dim,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            decoder_input_lengths,
            np.random.randint(0, decoder_input_length + 1,
                              size=(batch_size, )).astype(np.int32))
        workspace.FeedBlob(
            initial_decoder_hidden_state,
            np.random.randn(1, batch_size,
                            decoder_state_dim).astype(np.float32))
        workspace.FeedBlob(
            initial_decoder_cell_state,
            np.random.randn(1, batch_size,
                            decoder_state_dim).astype(np.float32))
        workspace.FeedBlob(
            initial_attention_weighted_encoder_context,
            np.random.randn(1, batch_size,
                            encoder_output_dim).astype(np.float32))
        inputs = [workspace.FetchBlob(name) for name in op.input]

        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=inputs,
            reference=lstm_with_recurrent_attention_reference,
            grad_reference=None,
            output_to_grad=None,
            outputs_to_check=range(6),
        )
        gradients_to_check = [
            index for (index, input_name) in enumerate(op.input)
            if input_name != "decoder_input_lengths"
        ]
        for param in gradients_to_check:
            self.assertGradientChecks(
                device_option=gc,
                op=op,
                inputs=inputs,
                outputs_to_check=param,
                outputs_with_grads=[0, 4],
                threshold=0.01,
                stepsize=0.001,
            )
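The test's arguments (lengths, dimensions, batch size, and the gc/dc device
options) are typically drawn by hypothesis rather than passed by hand. A
plausible decorator for the method above, assuming Caffe2's standard
hypothesis test utilities, would be:

    from hypothesis import given
    import hypothesis.strategies as st
    import caffe2.python.hypothesis_test_util as hu

    @given(
        encoder_output_length=st.integers(min_value=1, max_value=3),
        encoder_output_dim=st.integers(min_value=1, max_value=3),
        decoder_input_length=st.integers(min_value=1, max_value=3),
        decoder_state_dim=st.integers(min_value=1, max_value=3),
        batch_size=st.integers(min_value=1, max_value=3),
        **hu.gcs  # supplies the gc/dc device options
    )
    def test_lstm_with_recurrent_attention(
        self,
        encoder_output_length,
        encoder_output_dim,
        decoder_input_length,
        decoder_state_dim,
        batch_size,
        gc,
        dc,
    ):
        ...  # body as in the example above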
Example #3
    def model_build_fun(self, model, forward_only=False, loss_scale=None):
        encoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_inputs')
        encoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'encoder_lengths')
        decoder_inputs = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_inputs')
        decoder_lengths = model.net.AddExternalInput(
            workspace.GetNameScope() + 'decoder_lengths')
        targets = model.net.AddExternalInput(
            workspace.GetNameScope() + 'targets')
        target_weights = model.net.AddExternalInput(
            workspace.GetNameScope() + 'target_weights')
        attention_type = self.model_params['attention']
        assert attention_type in ['none', 'regular']

        (
            encoder_outputs,
            weighted_encoder_outputs,
            final_encoder_hidden_state,
            final_encoder_cell_state,
            encoder_output_dim,
        ) = self._build_embedding_encoder(
            model=model,
            inputs=encoder_inputs,
            input_lengths=encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=self.encoder_embeddings,
            embedding_size=self.model_params['encoder_embedding_size'],
            use_attention=(attention_type != 'none'),
            num_gpus=self.num_gpus,
            forward_only=forward_only,
        )

        assert len(self.model_params['decoder_layer_configs']) == 1
        decoder_num_units = (
            self.model_params['decoder_layer_configs'][0]['num_units'])

        if attention_type == 'none':
            decoder_initial_hidden_state = model.FC(
                final_encoder_hidden_state,
                'decoder_initial_hidden_state',
                encoder_output_dim,
                decoder_num_units,
                axis=2,
            )
            decoder_initial_cell_state = model.FC(
                final_encoder_cell_state,
                'decoder_initial_cell_state',
                encoder_output_dim,
                decoder_num_units,
                axis=2,
            )
        else:
            decoder_initial_hidden_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_hidden_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            decoder_initial_cell_state = model.param_init_net.ConstantFill(
                [],
                'decoder_initial_cell_state',
                shape=[decoder_num_units],
                value=0.0,
            )
            initial_attention_weighted_encoder_context = (
                model.param_init_net.ConstantFill(
                    [],
                    'initial_attention_weighted_encoder_context',
                    shape=[encoder_output_dim],
                    value=0.0,
                ))

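        # The Gather (embedding lookup) runs on CPU in both branches below;
        # for GPU training its output is then copied to the device, which
        # keeps the embedding table itself in host memory.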
        if self.num_gpus == 0:
            embedded_decoder_inputs = model.net.Gather(
                [self.decoder_embeddings, decoder_inputs],
                ['embedded_decoder_inputs'],
            )
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                embedded_decoder_inputs_cpu = model.net.Gather(
                    [self.decoder_embeddings, decoder_inputs],
                    ['embedded_decoder_inputs_cpu'],
                )
            embedded_decoder_inputs = model.CopyCPUToGPU(
                embedded_decoder_inputs_cpu,
                'embedded_decoder_inputs',
            )

        # seq_len x batch_size x decoder_embedding_size
        if attention_type == 'none':
            decoder_outputs, _, _, _ = recurrent.LSTM(
                model=model,
                input_blob=embedded_decoder_inputs,
                seq_lengths=decoder_lengths,
                initial_states=(
                    decoder_initial_hidden_state,
                    decoder_initial_cell_state,
                ),
                dim_in=self.model_params['decoder_embedding_size'],
                dim_out=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0],
            )
            decoder_output_size = decoder_num_units
        else:
            (decoder_outputs, _, _, _, attention_weighted_encoder_contexts,
             _) = recurrent.LSTMWithAttention(
                 model=model,
                 decoder_inputs=embedded_decoder_inputs,
                 decoder_input_lengths=decoder_lengths,
                 initial_decoder_hidden_state=decoder_initial_hidden_state,
                 initial_decoder_cell_state=decoder_initial_cell_state,
                 initial_attention_weighted_encoder_context=(
                     initial_attention_weighted_encoder_context),
                 encoder_output_dim=encoder_output_dim,
                 encoder_outputs=encoder_outputs,
                 decoder_input_dim=self.model_params['decoder_embedding_size'],
                 decoder_state_dim=decoder_num_units,
                 scope='decoder',
                 outputs_with_grads=[0, 4],
             )
            decoder_outputs, _ = model.net.Concat(
                [decoder_outputs, attention_weighted_encoder_contexts],
                [
                    'states_and_context_combination',
                    '_states_and_context_combination_concat_dims',
                ],
                axis=2,
            )
            decoder_output_size = decoder_num_units + encoder_output_dim

        # we do softmax over the whole sequence:
        # (max_length in the batch * batch_size) x decoder_output_size
        # -1 because we don't know max_length yet
        decoder_outputs_flattened, _ = model.net.Reshape(
            [decoder_outputs],
            [
                'decoder_outputs_flattened',
                'decoder_outputs_and_contexts_combination_old_shape',
            ],
            shape=[-1, decoder_output_size],
        )
        output_logits = self.output_projection(
            model=model,
            decoder_outputs=decoder_outputs_flattened,
            decoder_output_size=decoder_output_size,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=self.model_params['decoder_softmax_size'],
        )
        targets, _ = model.net.Reshape(
            [targets],
            ['targets', 'targets_old_shape'],
            shape=[-1],
        )
        target_weights, _ = model.net.Reshape(
            [target_weights],
            ['target_weights', 'target_weights_old_shape'],
            shape=[-1],
        )
        output_probs = model.net.Softmax(
            [output_logits],
            ['output_probs'],
            engine=('CUDNN' if self.num_gpus > 0 else None),
        )
        label_cross_entropy = model.net.LabelCrossEntropy(
            [output_probs, targets],
            ['label_cross_entropy'],
        )
        weighted_label_cross_entropy = model.net.Mul(
            [label_cross_entropy, target_weights],
            'weighted_label_cross_entropy',
        )
        total_loss_scalar = model.net.SumElements(
            [weighted_label_cross_entropy],
            'total_loss_scalar',
        )
        total_loss_scalar_weighted = model.net.Scale(
            [total_loss_scalar],
            'total_loss_scalar_weighted',
            scale=1.0 / self.batch_size,
        )
        return [total_loss_scalar_weighted]
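For reference, the last five operators assemble an ordinary weighted
cross-entropy, summed over all tokens and averaged over the batch. A small
numpy sketch of the same computation (illustrative, not part of the model):

    import numpy as np

    def weighted_seq_loss(logits, targets, weights, batch_size):
        # logits: (seq_len * batch_size) x vocab; targets, weights: flat.
        probs = np.exp(logits - logits.max(axis=1, keepdims=True))
        probs /= probs.sum(axis=1, keepdims=True)                # Softmax
        xent = -np.log(probs[np.arange(len(targets)), targets])  # LabelCrossEntropy
        weighted = xent * weights                                # Mul
        return weighted.sum() / batch_size                       # SumElements + Scale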