def __init__(self,
                 encoder_num_units,
                 decoder_num_units,
                 attention_type,
                 attention_dim,
                 use_cuda,
                 sharpening_factor=1,
                 sigmoid_smoothing=False,
                 out_channels=10,
                 kernel_size=201,
                 num_heads=1):
        """Register the per-head layers of the attention mechanism.

        One set of layers is created for each head, named with the suffix
        ``_head<h>`` (e.g. ``W_enc_head0``). When more than one head is
        requested an additional ``W_mha`` layer is created.

        Args:
            encoder_num_units: input size of every ``W_enc_head*`` layer.
            decoder_num_units: input size of every ``W_dec_head*`` layer, and
                the output size of ``W_enc_head*`` for ``'dot_product'``.
            attention_type: ``'content'``, ``'location'`` or
                ``'dot_product'``. ``'rnn_attention'`` and ``'coverage'`` are
                recognised but not implemented.
            attention_dim: output size of the ``W_*`` projections and input
                size of ``V_head*`` (content / location attention).
            use_cuda: forwarded to ``LinearND``; if true, ``to_gpu()`` is
                also called on every registered child link.
            sharpening_factor: stored on the instance, not used here.
            sigmoid_smoothing: stored on the instance, not used here.
            out_channels: number of output channels of the location
                convolution (``'location'`` only).
            kernel_size: width of the location convolution kernel
                (``'location'`` only); must be odd.
            num_heads: number of attention heads.

        Raises:
            AssertionError: ``attention_type == 'location'`` and
                ``kernel_size`` is even.
            NotImplementedError: ``attention_type`` is ``'rnn_attention'`` or
                ``'coverage'``.
            TypeError: ``attention_type`` is none of the known types.
        """
        super(AttentionMechanism, self).__init__()

        self.attention_type = attention_type
        self.attention_dim = attention_dim
        self.use_cuda = use_cuda
        self.sharpening_factor = sharpening_factor
        self.sigmoid_smoothing = sigmoid_smoothing
        self.num_heads = num_heads

        with self.init_scope():
            # Multi-head attention.
            # NOTE: this link must be assigned inside init_scope(); a link
            # assigned outside of it is stored as a plain attribute and is
            # not registered as a child (so it would be skipped by the
            # children() loop below).
            # Presumably merges the concatenated per-head outputs back to
            # encoder_num_units -- TODO confirm against the forward pass.
            if num_heads > 1:
                self.W_mha = LinearND(encoder_num_units * num_heads,
                                      encoder_num_units,
                                      use_cuda=use_cuda)

            for h in range(num_heads):
                head = '_head' + str(h)

                if attention_type in ('content', 'location'):
                    is_location = attention_type == 'location'
                    if is_location:
                        # pad=kernel_size // 2 below is symmetric, which
                        # only preserves the length for an odd kernel.
                        assert kernel_size % 2 == 1

                    setattr(
                        self, 'W_enc' + head,
                        LinearND(encoder_num_units,
                                 attention_dim,
                                 bias=True,
                                 use_cuda=use_cuda))
                    setattr(
                        self, 'W_dec' + head,
                        LinearND(decoder_num_units,
                                 attention_dim,
                                 bias=False,
                                 use_cuda=use_cuda))

                    if is_location:
                        setattr(
                            self, 'W_conv' + head,
                            LinearND(out_channels,
                                     attention_dim,
                                     bias=False,
                                     use_cuda=use_cuda))
                        # A 2D convolution with a (1, kernel_size) kernel is
                        # used in place of a 1D convolution.
                        setattr(
                            self, 'conv' + head,
                            L.Convolution2D(in_channels=1,
                                            out_channels=out_channels,
                                            ksize=(1, kernel_size),
                                            stride=1,
                                            pad=(0, kernel_size // 2),
                                            nobias=True,
                                            initialW=None,
                                            initial_bias=None))

                    setattr(
                        self, 'V' + head,
                        LinearND(attention_dim,
                                 1,
                                 bias=False,
                                 use_cuda=use_cuda))

                elif attention_type == 'dot_product':
                    setattr(
                        self, 'W_enc' + head,
                        LinearND(encoder_num_units,
                                 decoder_num_units,
                                 bias=False,
                                 use_cuda=use_cuda))

                elif attention_type in ('rnn_attention', 'coverage'):
                    raise NotImplementedError

                else:
                    raise TypeError(
                        "attention_type should be one of [%s], you provided %s."
                        % (", ".join(ATTENTION_TYPE), attention_type))

            if use_cuda:
                for c in self.children():
                    c.to_gpu()
Esempio n. 2
0
    def __init__(self,
                 input_size,
                 rnn_type,
                 bidirectional,
                 num_units,
                 num_proj,
                 num_layers,
                 dropout_input,
                 dropout_hidden,
                 subsample_list=[],
                 subsample_type='drop',
                 use_cuda=False,
                 merge_bidirectional=False,
                 num_stack=1,
                 splice=1,
                 input_channel=1,
                 conv_channels=[],
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 poolings=[],
                 activation='relu',
                 batch_norm=False,
                 residual=False,
                 dense_residual=False,
                 num_layers_sub=0):
        """Build a stack of single-layer RNNs, optionally preceded by a CNN.

        Every RNN layer is registered as ``<rnn_type>_l<index>`` and every
        projection layer as ``proj_l<index>``.

        Args:
            input_size: size of the input features. Without a CNN front-end
                it is multiplied by ``splice * num_stack``.
            rnn_type: ``'lstm'``, ``'gru'`` or ``'rnn'``.
            bidirectional: if true, bidirectional RNN links are used.
            num_units: ``out_size`` of every RNN layer.
            num_proj: output size of the projection placed after every RNN
                layer except the last; ``None`` or 0 disables projections.
            num_layers: number of RNN layers.
            dropout_input: stored on the instance.
            dropout_hidden: stored on the instance and forwarded to the CNN
                and to the projection layers.
            subsample_list: one flag per layer; empty means no subsampling.
            subsample_type: ``'drop'`` or ``'concat'``. With ``'concat'`` the
                layer following a subsampling layer gets twice the input
                size.
            use_cuda: if true, the created links are moved to the GPU.
            merge_bidirectional: stored on the instance.
            num_stack: frame-stacking factor; must be 1 when a CNN is used.
            splice: splicing factor; must be 1 when a CNN is used.
            input_channel: forwarded to ``CNNEncoder``.
            conv_channels: forwarded to ``CNNEncoder``. The CNN is only built
                when this is non-empty and has the same length as
                ``conv_kernel_sizes`` and ``conv_strides``.
            conv_kernel_sizes: forwarded to ``CNNEncoder``.
            conv_strides: forwarded to ``CNNEncoder``.
            poolings: forwarded to ``CNNEncoder``.
            activation: forwarded to ``CNNEncoder``.
            batch_norm: forwarded to ``CNNEncoder``.
            residual: stored on the instance; exclusive with
                ``dense_residual``.
            dense_residual: stored on the instance; exclusive with
                ``residual``.
            num_layers_sub: stored on the instance (hierarchical encoder).

        Raises:
            ValueError: on a ``subsample_list`` of the wrong length, an
                invalid ``num_layers_sub`` or an unknown ``rnn_type``.
            TypeError: on an unknown ``subsample_type``.
            AssertionError: if both residual modes are requested, or if a
                CNN is combined with ``num_stack``/``splice`` other than 1.
        """
        super(RNNEncoder, self).__init__()

        # ----- Argument checks -----
        if len(subsample_list) not in (0, num_layers):
            raise ValueError(
                'subsample_list must be the same size as num_layers.')
        if subsample_type not in ('drop', 'concat'):
            raise TypeError('subsample_type must be "drop" or "concat".')
        sub_too_deep = num_layers_sub > 1 and num_layers_sub > num_layers
        if num_layers_sub < 0 or sub_too_deep:
            raise ValueError('Set num_layers_sub between 1 to num_layers.')

        # ----- Plain settings -----
        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1
        self.num_units = num_units
        self.num_proj = 0 if num_proj is None else num_proj
        self.num_layers = num_layers
        self.dropout_input = dropout_input
        self.dropout_hidden = dropout_hidden
        self.merge_bidirectional = merge_bidirectional
        self.use_cuda = use_cuda

        # TODO: self.clip_activation = clip_activation

        # Hierarchical encoder
        self.num_layers_sub = num_layers_sub

        # Subsampling, based on
        # https://arxiv.org/abs/1508.01211
        #     Chan, William, et al. "Listen, attend and spell."
        #         arXiv preprint arXiv:1508.01211 (2015).
        no_subsampling = len(subsample_list) == 0
        self.subsample_list = (
            [False] * num_layers if no_subsampling else subsample_list)
        self.subsample_type = subsample_type

        # Residual connections (the two modes are mutually exclusive)
        assert not residual or not dense_residual
        self.residual = residual
        self.dense_residual = dense_residual

        # Position of the last subsampling layer, counted from 1, found by
        # walking the flags backwards; 0 when no layer subsamples.
        subsample_last_layer = next(
            (num_layers - offset
             for offset, flag in enumerate(reversed(subsample_list))
             if flag),
            0)
        # NOTE: residual connection starts from the last subsampling layer
        self.residual_start_layer = subsample_last_layer + 1

        with self.init_scope():
            # CNN front-end before the RNNs
            num_conv = len(conv_channels)
            use_conv = (num_conv > 0
                        and num_conv == len(conv_kernel_sizes)
                        == len(conv_strides))
            if use_conv:
                assert num_stack == 1 and splice == 1
                self.conv = CNNEncoder(input_size,
                                       input_channel=input_channel,
                                       conv_channels=conv_channels,
                                       conv_kernel_sizes=conv_kernel_sizes,
                                       conv_strides=conv_strides,
                                       poolings=poolings,
                                       dropout_input=0,
                                       dropout_hidden=dropout_hidden,
                                       activation=activation,
                                       use_cuda=use_cuda,
                                       batch_norm=batch_norm)
                input_size = self.conv.output_size
            else:
                self.conv = None
                input_size = input_size * splice * num_stack

            self.rnns = []
            self.projections = []
            for layer in range(num_layers):
                # Input size of this layer
                if layer == 0:
                    layer_in_size = input_size
                else:
                    if self.num_proj > 0:
                        layer_in_size = num_proj
                    else:
                        layer_in_size = num_units * self.num_directions
                    # 'concat' subsampling in the previous layer doubles the
                    # feature size seen by this layer.
                    if (subsample_type == 'concat'
                            and self.subsample_list[layer - 1]):
                        layer_in_size *= 2

                # Pick the Chainer link for this RNN flavour
                if rnn_type == 'lstm':
                    rnn_link = L.NStepBiLSTM if bidirectional else L.NStepLSTM
                elif rnn_type == 'gru':
                    rnn_link = L.NStepBiGRU if bidirectional else L.NStepGRU
                elif rnn_type == 'rnn':
                    # tanh variant (the ReLU links are the alternative)
                    rnn_link = (L.NStepBiRNNTanh
                                if bidirectional else L.NStepRNNTanh)
                else:
                    raise ValueError(
                        'rnn_type must be "lstm" or "gru" or "rnn".')

                rnn_layer = rnn_link(n_layers=1,
                                     in_size=layer_in_size,
                                     out_size=num_units,
                                     dropout=0)
                if use_cuda:
                    rnn_layer.to_gpu()
                setattr(self, rnn_type + '_l' + str(layer), rnn_layer)

                # Projection after every layer but the last
                is_last_layer = layer == self.num_layers - 1
                if not is_last_layer and self.num_proj > 0:
                    proj_layer = LinearND(num_units * self.num_directions,
                                          num_proj,
                                          dropout=dropout_hidden,
                                          use_cuda=use_cuda)

                    if use_cuda:
                        proj_layer.to_gpu()
                    setattr(self, 'proj_l' + str(layer), proj_layer)
Esempio n. 3
0
    def __init__(self,
                 input_size,
                 encoder_type,
                 encoder_bidirectional,
                 encoder_num_units,
                 encoder_num_proj,
                 encoder_num_layers,
                 fc_list,
                 dropout_input,
                 dropout_encoder,
                 num_classes,
                 parameter_init_distribution='uniform',
                 parameter_init=0.1,
                 recurrent_weight_orthogonal=False,
                 init_forget_gate_bias_with_one=True,
                 subsample_list=[],
                 subsample_type='drop',
                 logits_temperature=1,
                 num_stack=1,
                 splice=1,
                 input_channel=1,
                 conv_channels=[],
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 poolings=[],
                 activation='relu',
                 batch_norm=False,
                 label_smoothing_prob=0,
                 weight_noise_std=0,
                 encoder_residual=False,
                 encoder_dense_residual=False):
        """Build the CTC model: encoder, fully-connected layers, output layer.

        Registers ``encoder``, ``fc_0`` ... ``fc_<n-1>`` (one per entry of
        ``fc_list``) and ``fc_out``, then initializes the parameters.

        NOTE(review): ``self.use_cuda`` is read here but never assigned in
        this method; presumably it is provided by the base class -- confirm.
        NOTE(review): ``super(ModelBase, self)`` skips ``ModelBase.__init__``
        itself and calls the next initializer in the MRO -- confirm that this
        is intended.

        Args:
            input_size: size of the input features, forwarded to the encoder.
            encoder_type: ``'lstm'``, ``'gru'``, ``'rnn'`` or ``'cnn'``.
            encoder_bidirectional: if true, ``self.encoder_num_units`` is
                doubled and a bidirectional RNN encoder is requested.
            encoder_num_units: number of units per RNN encoder layer.
            encoder_num_proj: projection size forwarded to the RNN encoder.
            encoder_num_layers: number of RNN encoder layers.
            fc_list: output sizes of the fully-connected layers placed
                between the encoder and the output layer; may be empty.
            dropout_input: input dropout forwarded to the encoder.
            dropout_encoder: dropout forwarded to the encoder and to the
                fully-connected layers.
            num_classes: number of output classes without the blank; one is
                added for the CTC blank.
            parameter_init_distribution: distribution passed to
                ``init_weights`` for all non-bias parameters.
            parameter_init: value passed to ``init_weights``.
            recurrent_weight_orthogonal: if true (and the encoder is not a
                CNN), re-initialize with the ``'orthogonal'`` distribution
                using the keys ``[encoder_type, 'weight']``.
            init_forget_gate_bias_with_one: if true, call
                ``self.init_forget_gate_bias_with_one()``.
            subsample_list: forwarded to the RNN encoder.
            subsample_type: forwarded to the RNN encoder.
            logits_temperature: stored on the instance.
            num_stack: forwarded to the RNN encoder; must be 1 for ``'cnn'``.
            splice: forwarded to the RNN encoder; must be 1 for ``'cnn'``.
            input_channel: forwarded to the encoder.
            conv_channels: forwarded to the encoder.
            conv_kernel_sizes: forwarded to the encoder.
            conv_strides: forwarded to the encoder.
            poolings: forwarded to the encoder.
            activation: forwarded to the encoder.
            batch_norm: stored on the instance and forwarded to the encoder.
            label_smoothing_prob: stored as ``self.ls_prob``.
            weight_noise_std: stored as a float.
            encoder_residual: forwarded to the RNN encoder.
            encoder_dense_residual: forwarded to the RNN encoder.

        Raises:
            NotImplementedError: ``encoder_type`` is not supported.
            AssertionError: ``'cnn'`` encoder with ``num_stack`` or
                ``splice`` other than 1.
        """
        super(ModelBase, self).__init__()
        self.model_type = 'ctc'

        # Setting for the encoder
        self.input_size = input_size
        self.num_stack = num_stack
        self.encoder_type = encoder_type
        self.encoder_num_units = encoder_num_units
        if encoder_bidirectional:
            self.encoder_num_units *= 2
        self.fc_list = fc_list
        self.subsample_list = subsample_list
        self.batch_norm = batch_norm

        # Setting for CTC
        self.num_classes = num_classes + 1  # Add the blank class
        self.logits_temperature = logits_temperature

        # Setting for regularization
        self.weight_noise_injection = False
        self.weight_noise_std = float(weight_noise_std)
        self.ls_prob = label_smoothing_prob

        with self.init_scope():
            # Load the encoder
            if encoder_type in ['lstm', 'gru', 'rnn']:
                self.encoder = load(encoder_type=encoder_type)(
                    input_size=input_size,
                    rnn_type=encoder_type,
                    bidirectional=encoder_bidirectional,
                    num_units=encoder_num_units,
                    num_proj=encoder_num_proj,
                    num_layers=encoder_num_layers,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    subsample_list=subsample_list,
                    subsample_type=subsample_type,
                    use_cuda=self.use_cuda,
                    merge_bidirectional=False,
                    num_stack=num_stack,
                    splice=splice,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    activation=activation,
                    batch_norm=batch_norm,
                    residual=encoder_residual,
                    dense_residual=encoder_dense_residual)
            elif encoder_type == 'cnn':
                assert num_stack == 1 and splice == 1
                self.encoder = load(encoder_type='cnn')(
                    input_size=input_size,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    use_cuda=self.use_cuda,
                    activation=activation,
                    batch_norm=batch_norm)
            else:
                raise NotImplementedError

            ##################################################
            # Fully-connected layers
            ##################################################
            # Size of the features entering the next layer. It starts at the
            # encoder output size and is also used for fc_out, so that a CNN
            # encoder without any fc layer gets an output layer of the right
            # input size (encoder_num_units does not describe a CNN).
            if encoder_type == 'cnn':
                fc_input_size = self.encoder.output_size
            else:
                fc_input_size = self.encoder_num_units

            # TODO: add batch normalization layers (and remove the bias term
            # of the fc layers in that case)
            for i, fc_size in enumerate(fc_list):
                setattr(
                    self, 'fc_' + str(i),
                    LinearND(fc_input_size,
                             fc_size,
                             dropout=dropout_encoder,
                             use_cuda=self.use_cuda))
                fc_input_size = fc_size

            self.fc_out = LinearND(fc_input_size,
                                   self.num_classes,
                                   use_cuda=self.use_cuda)

            ##################################################
            # Initialize parameters
            ##################################################
            self.init_weights(parameter_init,
                              distribution=parameter_init_distribution,
                              ignore_keys=['bias'])

            # Initialize all biases with 0
            self.init_weights(0, distribution='constant', keys=['bias'])

            # Recurrent weights are orthogonalized
            if recurrent_weight_orthogonal and encoder_type != 'cnn':
                self.init_weights(parameter_init,
                                  distribution='orthogonal',
                                  keys=[encoder_type, 'weight'],
                                  ignore_keys=['bias'])

            # Initialize bias in forget gate with 1
            if init_forget_gate_bias_with_one:
                self.init_forget_gate_bias_with_one()

        # Set CTC decoders
        self._decode_greedy_np = GreedyDecoder(blank_index=0)
        self._decode_beam_np = BeamSearchDecoder(blank_index=0)
Esempio n. 4
0
    def __init__(
            self,
            input_size,
            encoder_type,
            encoder_bidirectional,
            encoder_num_units,
            encoder_num_proj,
            encoder_num_layers,
            encoder_num_layers_sub,  # ***
            fc_list,
            fc_list_sub,
            dropout_input,
            dropout_encoder,
            main_loss_weight,  # ***
            sub_loss_weight,  # ***
            num_classes,
            num_classes_sub,  # ***
            parameter_init_distribution='uniform',
            parameter_init=0.1,
            recurrent_weight_orthogonal=False,
            init_forget_gate_bias_with_one=True,
            subsample_list=[],
            subsample_type='drop',
            logits_temperature=1,
            num_stack=1,
            splice=1,
            input_channel=1,
            conv_channels=[],
            conv_kernel_sizes=[],
            conv_strides=[],
            poolings=[],
            activation='relu',
            batch_norm=False,
            label_smoothing_prob=0,
            weight_noise_std=0,
            encoder_residual=False,
            encoder_dense_residual=False):
        """Build the hierarchical (main + sub task) CTC model.

        The parent constructor is run first; its ``encoder``, ``fc_*`` and
        ``fc_out`` links are then deleted and rebuilt here (the encoder with
        ``num_layers_sub``), the sub-task layers ``fc_sub_*`` and
        ``fc_out_sub`` are added, and all parameters are re-initialized.
        Parameters marked ``# ***`` in the signature are specific to the
        hierarchical model.

        Args:
            input_size: size of the input features.
            encoder_type: ``'lstm'``, ``'gru'``, ``'rnn'`` or ``'cnn'``.
            encoder_bidirectional: forwarded to the parent and the encoder.
            encoder_num_units: number of units per RNN encoder layer.
            encoder_num_proj: projection size forwarded to the RNN encoder.
            encoder_num_layers: number of RNN encoder layers.
            encoder_num_layers_sub: forwarded to the RNN encoder as
                ``num_layers_sub``.
            fc_list: sizes of the fully-connected layers of the main task.
            fc_list_sub: sizes of the fully-connected layers of the sub task.
            dropout_input: input dropout forwarded to the encoder.
            dropout_encoder: dropout forwarded to the encoder and fc layers.
            main_loss_weight: stored on the instance.
            sub_loss_weight: stored on the instance.
            num_classes: number of main-task classes without the blank.
            num_classes_sub: number of sub-task classes without the blank;
                one is added for the CTC blank.
            parameter_init_distribution: distribution passed to
                ``init_weights`` for all non-bias parameters.
            parameter_init: value passed to ``init_weights``.
            recurrent_weight_orthogonal: if true (and the encoder is not a
                CNN), re-initialize with the ``'orthogonal'`` distribution
                using the keys ``[encoder_type, 'weight']``.
            init_forget_gate_bias_with_one: if true, call
                ``self.init_forget_gate_bias_with_one()``.
            subsample_list: forwarded to the parent and the RNN encoder.
            subsample_type: forwarded to the parent and the RNN encoder.
            logits_temperature: forwarded to the parent.
            num_stack: forwarded; must be 1 for a ``'cnn'`` encoder.
            splice: forwarded; must be 1 for a ``'cnn'`` encoder.
            input_channel: forwarded to the parent and the encoder.
            conv_channels: forwarded to the parent and the encoder.
            conv_kernel_sizes: forwarded to the parent and the encoder.
            conv_strides: forwarded to the parent and the encoder.
            poolings: forwarded to the parent and the encoder.
            activation: forwarded to the encoder built here.
            batch_norm: forwarded to the parent and the encoder.
            label_smoothing_prob: forwarded to the parent.
            weight_noise_std: forwarded to the parent.
            encoder_residual: forwarded to the RNN encoder built here.
            encoder_dense_residual: forwarded to the RNN encoder built here.

        Raises:
            NotImplementedError: ``encoder_type`` is not supported.
            AssertionError: ``'cnn'`` encoder with ``num_stack`` or
                ``splice`` other than 1.
        """
        # NOTE(review): parameter_init_distribution,
        # recurrent_weight_orthogonal, init_forget_gate_bias_with_one,
        # activation and the residual flags are not forwarded here. The
        # encoder is rebuilt and the weights are re-initialized below, so the
        # parent's defaults only affect links that are replaced anyway.
        super(HierarchicalCTC,
              self).__init__(input_size=input_size,
                             encoder_type=encoder_type,
                             encoder_bidirectional=encoder_bidirectional,
                             encoder_num_units=encoder_num_units,
                             encoder_num_proj=encoder_num_proj,
                             encoder_num_layers=encoder_num_layers,
                             dropout_input=dropout_input,
                             dropout_encoder=dropout_encoder,
                             num_classes=num_classes,
                             parameter_init=parameter_init,
                             subsample_list=subsample_list,
                             subsample_type=subsample_type,
                             fc_list=fc_list,
                             num_stack=num_stack,
                             splice=splice,
                             input_channel=input_channel,
                             conv_channels=conv_channels,
                             conv_kernel_sizes=conv_kernel_sizes,
                             conv_strides=conv_strides,
                             poolings=poolings,
                             logits_temperature=logits_temperature,
                             batch_norm=batch_norm,
                             label_smoothing_prob=label_smoothing_prob,
                             weight_noise_std=weight_noise_std)
        self.model_type = 'hierarchical_ctc'

        # Setting for the encoder
        self.encoder_num_layers_sub = encoder_num_layers_sub
        self.fc_list_sub = fc_list_sub

        # Setting for CTC
        self.num_classes_sub = num_classes_sub + 1  # Add the blank class

        # Setting for MTL
        self.main_loss_weight = main_loss_weight
        self.sub_loss_weight = sub_loss_weight

        with self.init_scope():
            # Override the encoder built by the parent
            delattr(self, 'encoder')

            # Load the encoder
            if encoder_type in ['lstm', 'gru', 'rnn']:
                self.encoder = load(encoder_type=encoder_type)(
                    input_size=input_size,  # 120 or 123
                    rnn_type=encoder_type,
                    bidirectional=encoder_bidirectional,
                    num_units=encoder_num_units,
                    num_proj=encoder_num_proj,
                    num_layers=encoder_num_layers,
                    num_layers_sub=encoder_num_layers_sub,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    subsample_list=subsample_list,
                    subsample_type=subsample_type,
                    use_cuda=self.use_cuda,
                    merge_bidirectional=False,
                    num_stack=num_stack,
                    splice=splice,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    activation=activation,
                    batch_norm=batch_norm,
                    residual=encoder_residual,
                    dense_residual=encoder_dense_residual)
            elif encoder_type == 'cnn':
                assert num_stack == 1 and splice == 1
                self.encoder = load(encoder_type='cnn')(
                    input_size=input_size,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    use_cuda=self.use_cuda,
                    activation=activation,
                    batch_norm=batch_norm)
            else:
                raise NotImplementedError

            # Size of the features leaving the encoder. It is the input size
            # of the first layer of both tasks, including fc_out/fc_out_sub
            # when a task has no fc layer (encoder_num_units does not
            # describe a CNN encoder).
            if encoder_type == 'cnn':
                encoder_output_size = self.encoder.output_size
            else:
                encoder_output_size = self.encoder_num_units

            ##################################################
            # Fully-connected layers in the main task
            ##################################################
            # Override the layers built by the parent
            delattr(self, 'fc_out')
            # TODO: add batch norm layers (and remove the bias term of the
            # fc layers in that case)
            fc_input_size = encoder_output_size
            for i, fc_size in enumerate(fc_list):
                delattr(self, 'fc_' + str(i))
                setattr(
                    self, 'fc_' + str(i),
                    LinearND(fc_input_size,
                             fc_size,
                             dropout=dropout_encoder,
                             use_cuda=self.use_cuda))
                fc_input_size = fc_size

            self.fc_out = LinearND(fc_input_size,
                                   self.num_classes,
                                   use_cuda=self.use_cuda)

            ##################################################
            # Fully-connected layers in the sub task
            ##################################################
            # TODO: add batch norm layers (and remove the bias term of the
            # fc layers in that case)
            fc_input_size_sub = encoder_output_size
            for i, fc_size_sub in enumerate(fc_list_sub):
                setattr(
                    self, 'fc_sub_' + str(i),
                    LinearND(fc_input_size_sub,
                             fc_size_sub,
                             dropout=dropout_encoder,
                             use_cuda=self.use_cuda))
                fc_input_size_sub = fc_size_sub

            self.fc_out_sub = LinearND(fc_input_size_sub,
                                       self.num_classes_sub,
                                       use_cuda=self.use_cuda)

            ##################################################
            # Initialize parameters
            ##################################################
            self.init_weights(parameter_init,
                              distribution=parameter_init_distribution,
                              ignore_keys=['bias'])

            # Initialize all biases with 0
            self.init_weights(0, distribution='constant', keys=['bias'])

            # Recurrent weights are orthogonalized. The key follows the
            # encoder type (RNN layers are presumably registered as
            # '<rnn_type>_l<i>' -- confirm against the encoder) instead of a
            # hard-coded 'lstm', and CNN encoders are skipped, matching the
            # single-task CTC constructor.
            if recurrent_weight_orthogonal and encoder_type != 'cnn':
                self.init_weights(parameter_init,
                                  distribution='orthogonal',
                                  keys=[encoder_type, 'weight'],
                                  ignore_keys=['bias'])

            # Initialize bias in forget gate with 1
            if init_forget_gate_bias_with_one:
                self.init_forget_gate_bias_with_one()
Esempio n. 5
0
    def __init__(
            self,
            input_size,
            encoder_type,
            encoder_bidirectional,
            encoder_num_units,
            encoder_num_proj,
            encoder_num_layers,
            encoder_num_layers_sub,  # ***
            attention_type,
            attention_dim,
            decoder_type,
            decoder_num_units,
            decoder_num_layers,
            decoder_num_units_sub,  # ***
            decoder_num_layers_sub,  # ***
            embedding_dim,
            embedding_dim_sub,  # ***
            dropout_input,
            dropout_encoder,
            dropout_decoder,
            dropout_embedding,
            main_loss_weight,  # ***
            sub_loss_weight,  # ***
            num_classes,
            num_classes_sub,  # ***
            parameter_init_distribution='uniform',
            parameter_init=0.1,
            recurrent_weight_orthogonal=False,
            init_forget_gate_bias_with_one=True,
            subsample_list=[],
            subsample_type='drop',
            bridge_layer=False,
            init_dec_state='first',
            sharpening_factor=1,
            logits_temperature=1,
            sigmoid_smoothing=False,
            coverage_weight=0,
            ctc_loss_weight_sub=0,  # ***
            attention_conv_num_channels=10,
            attention_conv_width=201,
            num_stack=1,
            splice=1,
            input_channel=1,
            conv_channels=[],
            conv_kernel_sizes=[],
            conv_strides=[],
            poolings=[],
            activation='relu',
            batch_norm=False,
            scheduled_sampling_prob=0,
            scheduled_sampling_max_step=0,
            label_smoothing_prob=0,
            weight_noise_std=0,
            encoder_residual=False,
            encoder_dense_residual=False,
            decoder_residual=False,
            decoder_dense_residual=False,
            decoding_order='attend_generate_update',
            bottleneck_dim=256,
            bottleneck_dim_sub=256,  # ***
            backward_sub=False,  # ***
            num_heads=1,
            num_heads_sub=1):  # ***

        super(HierarchicalAttentionSeq2seq, self).__init__(
            input_size=input_size,
            encoder_type=encoder_type,
            encoder_bidirectional=encoder_bidirectional,
            encoder_num_units=encoder_num_units,
            encoder_num_proj=encoder_num_proj,
            encoder_num_layers=encoder_num_layers,
            attention_type=attention_type,
            attention_dim=attention_dim,
            decoder_type=decoder_type,
            decoder_num_units=decoder_num_units,
            decoder_num_layers=decoder_num_layers,
            embedding_dim=embedding_dim,
            dropout_input=dropout_input,
            dropout_encoder=dropout_encoder,
            dropout_decoder=dropout_decoder,
            dropout_embedding=dropout_embedding,
            num_classes=num_classes,
            parameter_init=parameter_init,
            subsample_list=subsample_list,
            subsample_type=subsample_type,
            bridge_layer=bridge_layer,
            init_dec_state=init_dec_state,
            sharpening_factor=sharpening_factor,
            logits_temperature=logits_temperature,
            sigmoid_smoothing=sigmoid_smoothing,
            coverage_weight=coverage_weight,
            ctc_loss_weight=0,
            attention_conv_num_channels=attention_conv_num_channels,
            attention_conv_width=attention_conv_width,
            num_stack=num_stack,
            splice=splice,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            scheduled_sampling_prob=scheduled_sampling_prob,
            scheduled_sampling_max_step=scheduled_sampling_max_step,
            label_smoothing_prob=label_smoothing_prob,
            weight_noise_std=weight_noise_std,
            encoder_residual=encoder_residual,
            encoder_dense_residual=encoder_dense_residual,
            decoder_residual=decoder_residual,
            decoder_dense_residual=decoder_dense_residual,
            decoding_order=decoding_order,
            bottleneck_dim=bottleneck_dim,
            backward_loss_weight=0,
            num_heads=num_heads)
        self.model_type = 'hierarchical_attention'

        # Setting for the encoder
        self.encoder_num_units_sub = encoder_num_units
        if encoder_bidirectional:
            self.encoder_num_units_sub *= 2

        # Setting for the decoder in the sub task
        self.decoder_num_units_1 = decoder_num_units_sub
        self.decoder_num_layers_1 = decoder_num_layers_sub
        self.num_classes_sub = num_classes_sub + 1  # Add <EOS> class
        self.sos_1 = num_classes_sub
        self.eos_1 = num_classes_sub
        # NOTE: <SOS> and <EOS> have the same index
        self.backward_1 = backward_sub

        # Setting for the decoder initialization in the sub task
        if backward_sub:
            if init_dec_state == 'first':
                self.init_dec_state_1_bwd = 'final'
            elif init_dec_state == 'final':
                self.init_dec_state_1_bwd = 'first'
            else:
                self.init_dec_state_1_bwd = init_dec_state
            if encoder_type != decoder_type:
                self.init_dec_state_1_bwd = 'zero'
        else:
            self.init_dec_state_1_fwd = init_dec_state
            if encoder_type != decoder_type:
                self.init_dec_state_1_fwd = 'zero'

        # Setting for the attention in the sub task
        self.num_heads_1 = num_heads_sub

        # Setting for MTL
        self.main_loss_weight = main_loss_weight
        self.sub_loss_weight = sub_loss_weight
        self.ctc_loss_weight_sub = ctc_loss_weight_sub
        if backward_sub:
            self.bwd_weight_1 = sub_loss_weight

        with self.init_scope():
            # Override the encoder created by the parent class
            delattr(self, 'encoder')

            ##############################
            # Encoder
            ##############################
            if encoder_type in ['lstm', 'gru', 'rnn']:
                self.encoder = load(encoder_type=encoder_type)(
                    input_size=input_size,
                    rnn_type=encoder_type,
                    bidirectional=encoder_bidirectional,
                    num_units=encoder_num_units,
                    num_proj=encoder_num_proj,
                    num_layers=encoder_num_layers,
                    num_layers_sub=encoder_num_layers_sub,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    subsample_list=subsample_list,
                    subsample_type=subsample_type,
                    use_cuda=self.use_cuda,
                    merge_bidirectional=False,
                    num_stack=num_stack,
                    splice=splice,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    activation=activation,
                    batch_norm=batch_norm,
                    residual=encoder_residual,
                    dense_residual=encoder_dense_residual)
            elif encoder_type == 'cnn':
                assert num_stack == 1 and splice == 1
                self.encoder = load(encoder_type='cnn')(
                    input_size=input_size,
                    input_channel=input_channel,
                    conv_channels=conv_channels,
                    conv_kernel_sizes=conv_kernel_sizes,
                    conv_strides=conv_strides,
                    poolings=poolings,
                    dropout_input=dropout_input,
                    dropout_hidden=dropout_encoder,
                    use_cuda=self.use_cuda,
                    activation=activation,
                    batch_norm=batch_norm)
                self.init_dec_state_0 = 'zero'
                self.init_dec_state_1 = 'zero'
            else:
                raise NotImplementedError

            dir = 'bwd' if backward_sub else 'fwd'
            self.is_bridge_sub = False
            if self.sub_loss_weight > 0:

                ##################################################
                # Bridge layer between the encoder and decoder
                ##################################################
                if encoder_type == 'cnn':
                    self.bridge_1 = LinearND(self.encoder.output_size,
                                             decoder_num_units_sub,
                                             dropout=dropout_encoder,
                                             use_cuda=self.use_cuda)
                    self.encoder_num_units_sub = decoder_num_units_sub
                    self.is_bridge_sub = True
                elif bridge_layer:
                    self.bridge_1 = LinearND(self.encoder_num_units_sub,
                                             decoder_num_units_sub,
                                             dropout=dropout_encoder,
                                             use_cuda=self.use_cuda)
                    self.encoder_num_units_sub = decoder_num_units_sub
                    self.is_bridge_sub = True
                else:
                    self.is_bridge_sub = False

                ##################################################
                # Initialization of the decoder
                ##################################################
                if getattr(self, 'init_dec_state_1_' + dir) != 'zero':
                    setattr(
                        self, 'W_dec_init_1_' + dir,
                        LinearND(self.encoder_num_units_sub,
                                 decoder_num_units_sub,
                                 use_cuda=self.use_cuda))

                ##############################
                # Decoder (sub)
                ##############################
                if decoding_order == 'conditional':
                    setattr(
                        self, 'decoder_first_1_' + dir,
                        RNNDecoder(input_size=embedding_dim_sub,
                                   rnn_type=decoder_type,
                                   num_units=decoder_num_units_sub,
                                   num_layers=1,
                                   dropout=dropout_decoder,
                                   use_cuda=self.use_cuda,
                                   residual=False,
                                   dense_residual=False))
                    setattr(
                        self, 'decoder_second_1_' + dir,
                        RNNDecoder(input_size=self.encoder_num_units_sub,
                                   rnn_type=decoder_type,
                                   num_units=decoder_num_units_sub,
                                   num_layers=1,
                                   dropout=dropout_decoder,
                                   use_cuda=self.use_cuda,
                                   residual=False,
                                   dense_residual=False))
                    # NOTE: the conditional decoder supports only a single layer
                else:
                    setattr(
                        self, 'decoder_1_' + dir,
                        RNNDecoder(input_size=self.encoder_num_units_sub +
                                   embedding_dim_sub,
                                   rnn_type=decoder_type,
                                   num_units=decoder_num_units_sub,
                                   num_layers=decoder_num_layers_sub,
                                   dropout=dropout_decoder,
                                   use_cuda=self.use_cuda,
                                   residual=decoder_residual,
                                   dense_residual=decoder_dense_residual))

                ###################################
                # Attention layer (sub)
                ###################################
                setattr(
                    self, 'attend_1_' + dir,
                    AttentionMechanism(
                        encoder_num_units=self.encoder_num_units_sub,
                        decoder_num_units=decoder_num_units_sub,
                        attention_type=attention_type,
                        attention_dim=attention_dim,
                        use_cuda=self.use_cuda,
                        sharpening_factor=sharpening_factor,
                        sigmoid_smoothing=sigmoid_smoothing,
                        out_channels=attention_conv_num_channels,
                        kernel_size=attention_conv_width,
                        num_heads=num_heads_sub))

                ##############################
                # Output layer (sub)
                ##############################
                setattr(
                    self, 'W_d_1_' + dir,
                    LinearND(decoder_num_units_sub,
                             bottleneck_dim_sub,
                             dropout=dropout_decoder,
                             use_cuda=self.use_cuda))
                setattr(
                    self, 'W_c_1_' + dir,
                    LinearND(self.encoder_num_units_sub,
                             bottleneck_dim_sub,
                             dropout=dropout_decoder,
                             use_cuda=self.use_cuda))
                setattr(
                    self, 'fc_1_' + dir,
                    LinearND(bottleneck_dim_sub,
                             self.num_classes_sub,
                             use_cuda=self.use_cuda))

                ##############################
                # Embedding (sub)
                ##############################
                if label_smoothing_prob > 0:
                    self.embed_1 = Embedding_LS(
                        num_classes=self.num_classes_sub,
                        embedding_dim=embedding_dim_sub,
                        dropout=dropout_embedding,
                        label_smoothing_prob=label_smoothing_prob,
                        use_cuda=self.use_cuda)
                else:
                    self.embed_1 = Embedding(num_classes=self.num_classes_sub,
                                             embedding_dim=embedding_dim_sub,
                                             dropout=dropout_embedding,
                                             ignore_index=self.sos_sub,
                                             use_cuda=self.use_cuda)

            ##############################
            # CTC (sub)
            ##############################
            if ctc_loss_weight_sub > 0:
                self.fc_ctc_1 = LinearND(self.encoder_num_units_sub,
                                         num_classes_sub + 1,
                                         use_cuda=self.use_cuda)

                # Set CTC decoders
                self._decode_ctc_greedy_np = GreedyDecoder(blank_index=0)
                self._decode_ctc_beam_np = BeamSearchDecoder(blank_index=0)

            ##################################################
            # Initialize parameters
            ##################################################
            self.init_weights(parameter_init,
                              distribution=parameter_init_distribution,
                              ignore_keys=['bias'])

            # Initialize all biases with 0
            self.init_weights(0, distribution='constant', keys=['bias'])

            # Recurrent weights are orthogonalized
            if recurrent_weight_orthogonal:
                self.init_weights(parameter_init,
                                  distribution='orthogonal',
                                  keys=[encoder_type, 'weight'],
                                  ignore_keys=['bias'])
                self.init_weights(parameter_init,
                                  distribution='orthogonal',
                                  keys=[decoder_type, 'weight'],
                                  ignore_keys=['bias'])

            # Initialize bias in forget gate with 1
            if init_forget_gate_bias_with_one:
                self.init_forget_gate_bias_with_one()