예제 #1
0
파일: train.py 프로젝트: zlbing/caffe2
    def model_build_fun(self, model, forward_only=False, loss_scale=None):
        """Add the seq2seq training graph to ``model`` and return its loss.

        The method registers the external input blobs under the current
        name scope, builds the embedding encoder, looks up decoder
        embeddings, runs an LSTM decoder (plain or with attention,
        selected by ``self.model_params['attention']``), projects the
        decoder outputs to logits and computes a target-weighted
        cross-entropy loss.

        Args:
            model: model helper whose ``net`` receives the operators.
            forward_only: accepted for interface compatibility; not read
                by this method.
            loss_scale: accepted for interface compatibility; not read
                by this method.

        Returns:
            A one-element list holding the blob with the summed weighted
            cross entropy scaled by ``1.0 / self.batch_size``.
        """
        name_scope = workspace.GetNameScope()

        def add_input(blob_name):
            # Register an external input blob under the current name scope.
            return model.net.AddExternalInput(name_scope + blob_name)

        encoder_inputs = add_input('encoder_inputs')
        encoder_lengths = add_input('encoder_lengths')
        decoder_inputs = add_input('decoder_inputs')
        decoder_lengths = add_input('decoder_lengths')
        targets = add_input('targets')
        target_weights = add_input('target_weights')

        attention_type = self.model_params['attention']
        assert attention_type in ('none', 'regular')
        use_attention = attention_type != 'none'

        encoder_result = seq2seq_util.build_embedding_encoder(
            model=model,
            encoder_params=self.encoder_params,
            inputs=encoder_inputs,
            input_lengths=encoder_lengths,
            vocab_size=self.source_vocab_size,
            embeddings=self.encoder_embeddings,
            embedding_size=self.model_params['encoder_embedding_size'],
            use_attention=use_attention,
            num_gpus=self.num_gpus,
        )
        # The weighted encoder outputs are returned but not used below.
        (
            encoder_outputs,
            _weighted_encoder_outputs,
            final_hidden_state,
            final_cell_state,
            encoder_output_dim,
        ) = encoder_result

        # Exactly one decoder layer configuration is supported here.
        decoder_layer_configs = self.model_params['decoder_layer_configs']
        assert len(decoder_layer_configs) == 1
        decoder_num_units = decoder_layer_configs[0]['num_units']

        initial_states = seq2seq_util.build_initial_rnn_decoder_states(
            model=model,
            encoder_num_units=encoder_output_dim,
            decoder_num_units=decoder_num_units,
            final_encoder_hidden_state=final_hidden_state,
            final_encoder_cell_state=final_cell_state,
            use_attention=use_attention,
        )

        lookup_inputs = [self.decoder_embeddings, decoder_inputs]
        if self.num_gpus != 0:
            # Do the embedding lookup on CPU, then copy the result over.
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                cpu_embedded = model.net.Gather(
                    lookup_inputs,
                    ['embedded_decoder_inputs_cpu'],
                )
            embedded_decoder_inputs = model.CopyCPUToGPU(
                cpu_embedded,
                'embedded_decoder_inputs',
            )
        else:
            embedded_decoder_inputs = model.net.Gather(
                lookup_inputs,
                ['embedded_decoder_inputs'],
            )

        # seq_len x batch_size x decoder_embedding_size
        if use_attention:
            attention_result = rnn_cell.LSTMWithAttention(
                model=model,
                decoder_inputs=embedded_decoder_inputs,
                decoder_input_lengths=decoder_lengths,
                initial_decoder_hidden_state=initial_states[0],
                initial_decoder_cell_state=initial_states[1],
                initial_attention_weighted_encoder_context=initial_states[2],
                encoder_output_dim=encoder_output_dim,
                encoder_outputs=encoder_outputs,
                decoder_input_dim=self.model_params['decoder_embedding_size'],
                decoder_state_dim=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0, 4],
            )
            (
                hidden_outputs,
                _,
                _,
                _,
                attention_contexts,
                _,
            ) = attention_result
            # Append the attention context to each decoder state (axis 2).
            decoder_outputs, _ = model.net.Concat(
                [hidden_outputs, attention_contexts],
                [
                    'states_and_context_combination',
                    '_states_and_context_combination_concat_dims',
                ],
                axis=2,
            )
            decoder_output_size = decoder_num_units + encoder_output_dim
        else:
            decoder_outputs, _, _, _ = rnn_cell.LSTM(
                model=model,
                input_blob=embedded_decoder_inputs,
                seq_lengths=decoder_lengths,
                initial_states=initial_states,
                dim_in=self.model_params['decoder_embedding_size'],
                dim_out=decoder_num_units,
                scope='decoder',
                outputs_with_grads=[0],
            )
            decoder_output_size = decoder_num_units

        # The softmax runs over the whole sequence at once, so collapse
        # the leading dimensions:
        # (max_length in the batch * batch_size) x decoder output size.
        # -1 is used because max_length is not known yet.
        flat_decoder_outputs, _ = model.net.Reshape(
            [decoder_outputs],
            [
                'decoder_outputs_flattened',
                'decoder_outputs_and_contexts_combination_old_shape',
            ],
            shape=[-1, decoder_output_size],
        )
        output_logits = seq2seq_util.output_projection(
            model=model,
            decoder_outputs=flat_decoder_outputs,
            decoder_output_size=decoder_output_size,
            target_vocab_size=self.target_vocab_size,
            decoder_softmax_size=self.model_params['decoder_softmax_size'],
        )
        flat_targets, _ = model.net.Reshape(
            [targets],
            ['targets', 'targets_old_shape'],
            shape=[-1],
        )
        flat_target_weights, _ = model.net.Reshape(
            [target_weights],
            ['target_weights', 'target_weights_old_shape'],
            shape=[-1],
        )

        if self.num_gpus > 0:
            softmax_engine = 'CUDNN'
        else:
            softmax_engine = None
        output_probs = model.net.Softmax(
            [output_logits],
            ['output_probs'],
            engine=softmax_engine,
        )
        cross_entropy = model.net.LabelCrossEntropy(
            [output_probs, flat_targets],
            ['label_cross_entropy'],
        )
        weighted_cross_entropy = model.net.Mul(
            [cross_entropy, flat_target_weights],
            'weighted_label_cross_entropy',
        )
        summed_loss = model.net.SumElements(
            [weighted_cross_entropy],
            'total_loss_scalar',
        )
        scaled_loss = model.net.Scale(
            [summed_loss],
            'total_loss_scalar_weighted',
            scale=1.0 / self.batch_size,
        )
        return [scaled_loss]
예제 #2
0
    def test_lstm_with_attention_equal_simplenet(self, T, forward_only, gc,
                                                 dc):
        """Build an ``LSTMWithAttention`` net and pass it to ``_compare``.

        Resets the workspace, feeds sequence lengths, a random target and
        zero initial states, builds a recurrent-attention LSTM under the
        device scope ``gc``, attaches a squared-L2 averaged loss (plus
        gradient operators unless ``forward_only``), and delegates the
        actual check to ``self._compare``.

        Args:
            T: sequence length used for the fed blobs and for ``self.Tseq``.
            forward_only: when true, no gradient operators are added.
            gc: device option used for the device scope.
            dc: not read by this method.
        """
        half_T = T // 2
        self.Tseq = [T, half_T, half_T + T // 4, T, half_T + 1]
        workspace.ResetWorkspace()
        with core.DeviceScope(gc):
            print("Run with device: {}, forward only: {}".format(
                gc, forward_only))

            state_shape = [1, self.batch_size, self.hidden_dim]
            context_shape = (1, self.batch_size, self.encoder_dim)

            seq_lengths = np.array([T] * self.batch_size, dtype=np.int32)
            workspace.FeedBlob("seq_lengths", seq_lengths)

            target = np.random.rand(T, self.batch_size, self.hidden_dim)
            workspace.FeedBlob("target", target.astype(np.float32))

            # Hidden and cell states both start as zeros.
            for state_name in ("hidden_init", "cell_init"):
                workspace.FeedBlob(
                    state_name, np.zeros(state_shape, dtype=np.float32))

            model = model_helper.ModelHelper(name="lstm")
            model.net.AddExternalInputs(["input"])

            hidden_init, cell_init, encoder_outputs = (
                model.net.AddExternalInputs(
                    "hidden_init", "cell_init", "encoder_outputs"))
            awec_init = model.net.AddExternalInputs([
                'initial_attention_weighted_encoder_context',
            ])
            init_blobs = [hidden_init, cell_init]

            # Random initial attention context, then random encoder outputs
            # (this order keeps the random stream consumption unchanged).
            for context_blob in (awec_init, encoder_outputs):
                workspace.FeedBlob(
                    context_blob,
                    np.random.rand(*context_shape).astype(np.float32),
                )

            outputs = rnn_cell.LSTMWithAttention(
                model=model,
                decoder_inputs="input",
                decoder_input_lengths="seq_lengths",
                initial_decoder_hidden_state=hidden_init,
                initial_decoder_cell_state=cell_init,
                initial_attention_weighted_encoder_context=awec_init,
                encoder_output_dim=self.encoder_dim,
                encoder_outputs=encoder_outputs,
                encoder_lengths=None,
                decoder_input_dim=self.input_dim,
                decoder_state_dim=self.hidden_dim,
                scope="",
                attention_type=AttentionType.Recurrent,
                forward_only=forward_only,
                outputs_with_grads=[0],
            )
            output = outputs[0]

            print(outputs)
            distance = model.SquaredL2Distance([output, "target"], "dist")
            loss = model.AveragedLoss(distance, "loss")
            # Add gradient ops
            if not forward_only:
                model.AddGradientOperators([loss])

            # Feed zeros into the initial-state blobs via their blob
            # references before running the comparison.
            for state_blob in init_blobs:
                workspace.FeedBlob(
                    state_blob, np.zeros(state_shape, dtype=np.float32))

            self._compare(model, forward_only)