def __init__(
            self,
            input_size,
            encoder_type,
            encoder_bidirectional,
            encoder_num_units,
            encoder_num_proj,
            encoder_num_layers,
            encoder_num_layers_sub,  # ***
            fc_list,
            fc_list_sub,
            dropout_input,
            dropout_encoder,
            main_loss_weight,  # ***
            sub_loss_weight,  # ***
            num_classes,
            num_classes_sub,  # ***
            parameter_init_distribution='uniform',
            parameter_init=0.1,
            recurrent_weight_orthogonal=False,
            init_forget_gate_bias_with_one=True,
            subsample_list=[],
            subsample_type='drop',
            logits_temperature=1,
            num_stack=1,
            splice=1,
            input_channel=1,
            conv_channels=[],
            conv_kernel_sizes=[],
            conv_strides=[],
            poolings=[],
            activation='relu',
            batch_norm=False,
            label_smoothing_prob=0,
            weight_noise_std=0,
            encoder_residual=False,
            encoder_dense_residual=False):

        super(HierarchicalCTC,
              self).__init__(input_size=input_size,
                             encoder_type=encoder_type,
                             encoder_bidirectional=encoder_bidirectional,
                             encoder_num_units=encoder_num_units,
                             encoder_num_proj=encoder_num_proj,
                             encoder_num_layers=encoder_num_layers,
                             dropout_input=dropout_input,
                             dropout_encoder=dropout_encoder,
                             num_classes=num_classes,
                             parameter_init=parameter_init,
                             subsample_list=subsample_list,
                             subsample_type=subsample_type,
                             fc_list=fc_list,
                             num_stack=num_stack,
                             splice=splice,
                             input_channel=input_channel,
                             conv_channels=conv_channels,
                             conv_kernel_sizes=conv_kernel_sizes,
                             conv_strides=conv_strides,
                             poolings=poolings,
                             logits_temperature=logits_temperature,
                             batch_norm=batch_norm,
                             label_smoothing_prob=label_smoothing_prob,
                             weight_noise_std=weight_noise_std)
        self.model_type = 'hierarchical_ctc'

        # Setting for the encoder
        self.encoder_num_layers_sub = encoder_num_layers_sub
        self.fc_list_sub = fc_list_sub

        # Setting for CTC
        self.num_classes_sub = num_classes_sub + 1  # Add the blank class

        # Setting for MTL
        self.main_loss_weight = main_loss_weight
        self.sub_loss_weight = sub_loss_weight

        # Load the encoder
        # NOTE: overide encoder
        if encoder_type in ['lstm', 'gru', 'rnn']:
            self.encoder = load(encoder_type=encoder_type)(
                input_size=input_size,
                rnn_type=encoder_type,
                bidirectional=encoder_bidirectional,
                num_units=encoder_num_units,
                num_proj=encoder_num_proj,
                num_layers=encoder_num_layers,
                num_layers_sub=encoder_num_layers_sub,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                subsample_list=subsample_list,
                subsample_type=subsample_type,
                batch_first=True,
                merge_bidirectional=False,
                pack_sequence=True,
                num_stack=num_stack,
                splice=splice,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                activation=activation,
                batch_norm=batch_norm,
                residual=encoder_residual,
                dense_residual=encoder_dense_residual)
        elif encoder_type == 'cnn':
            assert num_stack == 1 and splice == 1
            self.encoder = load(encoder_type='cnn')(
                input_size=input_size,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                activation=activation,
                batch_norm=batch_norm)
        else:
            raise NotImplementedError

        ##################################################
        # Fully-connected layers in the main task
        ##################################################
        if len(fc_list) > 0:
            for i in range(len(fc_list)):
                if i == 0:
                    if encoder_type == 'cnn':
                        bottle_input_size = self.encoder.output_size
                    else:
                        bottle_input_size = self.encoder_num_units

                    # TODO: add batch norm layers

                    setattr(
                        self, 'fc_0',
                        LinearND(bottle_input_size,
                                 fc_list[i],
                                 dropout=dropout_encoder))
                else:
                    # TODO: add batch norm layers

                    setattr(
                        self, 'fc_' + str(i),
                        LinearND(fc_list[i - 1],
                                 fc_list[i],
                                 dropout=dropout_encoder))
            # TODO: remove a bias term in the case of batch normalization

            self.fc_out = LinearND(fc_list[-1], self.num_classes)
        else:
            self.fc_out = LinearND(self.encoder_num_units, self.num_classes)

        ##################################################
        # Fully-connected layers in the sub task
        ##################################################
        if len(fc_list_sub) > 0:
            for i in range(len(fc_list_sub)):
                if i == 0:
                    if encoder_type == 'cnn':
                        bottle_input_size = self.encoder.output_size
                    else:
                        bottle_input_size = self.encoder_num_units

                    # TODO: add batch norm layers

                    setattr(
                        self, 'fc_sub_0',
                        LinearND(bottle_input_size,
                                 fc_list_sub[i],
                                 dropout=dropout_encoder))
                else:
                    # TODO: add batch norm layers

                    setattr(
                        self, 'fc_sub_' + str(i),
                        LinearND(fc_list_sub[i - 1],
                                 fc_list_sub[i],
                                 dropout=dropout_encoder))
            # TODO: remove a bias term in the case of batch normalization

            self.fc_out_sub = LinearND(fc_list_sub[-1], self.num_classes_sub)
        else:
            self.fc_out_sub = LinearND(self.encoder_num_units,
                                       self.num_classes_sub)

        ##################################################
        # Initialize parameters
        ##################################################
        self.init_weights(parameter_init,
                          distribution=parameter_init_distribution,
                          ignore_keys=['bias'])

        # Initialize all biases with 0
        self.init_weights(0, distribution='constant', keys=['bias'])

        # Recurrent weights are orthogonalized
        if recurrent_weight_orthogonal:
            self.init_weights(parameter_init,
                              distribution='orthogonal',
                              keys=['lstm', 'weight'],
                              ignore_keys=['bias'])

        # Initialize bias in forget gate with 1
        if init_forget_gate_bias_with_one:
            self.init_forget_gate_bias_with_one()
    def check(self,
              encoder_type,
              bidirectional=False,
              batch_first=True,
              subsample_type='concat',
              conv=False,
              merge_bidirectional=False,
              projection=False,
              residual=False,
              dense_residual=False):

        print('==================================================')
        print('  encoder_type: %s' % encoder_type)
        print('  bidirectional: %s' % str(bidirectional))
        print('  batch_first: %s' % str(batch_first))
        print('  subsample_type: %s' % subsample_type)
        print('  conv: %s' % str(conv))
        print('  merge_bidirectional: %s' % str(merge_bidirectional))
        print('  projection: %s' % str(projection))
        print('  residual: %s' % str(residual))
        print('  dense_residual: %s' % str(dense_residual))
        print('==================================================')

        if conv:
            # pattern 1
            # conv_channels = [32, 32]
            # conv_kernel_sizes = [[41, 11], [21, 11]]
            # conv_strides = [[2, 2], [2, 1]]
            # poolings = [[], []]

            # pattern 2 (VGG like)
            conv_channels = [64, 64]
            conv_kernel_sizes = [[3, 3], [3, 3]]
            conv_strides = [[1, 1], [1, 1]]
            poolings = [[2, 2], [2, 2]]
        else:
            conv_channels = []
            conv_kernel_sizes = []
            conv_strides = []
            poolings = []

        # Load batch data
        batch_size = 4
        splice = 1
        num_stack = 1
        xs, _, x_lens, _ = generate_data(batch_size=batch_size,
                                         num_stack=num_stack,
                                         splice=splice)

        # Wrap by Tensor
        xs = torch.from_numpy(xs)
        x_lens = torch.from_numpy(x_lens)

        # Load encoder
        encoder = load(encoder_type=encoder_type)

        # Initialize encoder
        if encoder_type in ['lstm', 'gru', 'rnn']:
            encoder = encoder(
                input_size=xs.size(-1) // splice // num_stack,  # 120
                rnn_type=encoder_type,
                bidirectional=bidirectional,
                num_units=256,
                num_proj=256 if projection else 0,
                num_layers=6,
                num_layers_sub=4,
                dropout_input=0.2,
                dropout_hidden=0.2,
                subsample_list=[False, True, True, False, False, False],
                subsample_type=subsample_type,
                batch_first=batch_first,
                merge_bidirectional=merge_bidirectional,
                splice=splice,
                num_stack=num_stack,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                batch_norm=True,
                residual=residual,
                dense_residual=dense_residual)
        else:
            raise NotImplementedError

        max_time = xs.size(1)
        if conv:
            max_time = encoder.conv.get_conv_out_size(max_time, 1)
        max_time_sub = max_time // \
            (2 ** sum(encoder.subsample_list[:encoder.num_layers_sub]))
        max_time //= (2**sum(encoder.subsample_list))
        if subsample_type == 'drop':
            max_time_sub = math.ceil(max_time_sub)
            max_time = math.ceil(max_time)
        elif subsample_type == 'concat':
            max_time_sub = int(max_time_sub)
            max_time = int(max_time)

        outputs, _, outputs_sub, _, perm_indices = encoder(xs, x_lens)

        print('----- outputs -----')
        print(xs.size())
        print(outputs_sub.size())
        print(outputs.size())
        num_directions = 2 if bidirectional and not merge_bidirectional else 1
        if batch_first:
            self.assertEqual(
                (batch_size, max_time_sub, encoder.num_units * num_directions),
                outputs_sub.size())
            self.assertEqual(
                (batch_size, max_time, encoder.num_units * num_directions),
                outputs.size())
        else:
            self.assertEqual(
                (max_time_sub, batch_size, encoder.num_units * num_directions),
                outputs_sub.size())
            self.assertEqual(
                (max_time, batch_size, encoder.num_units * num_directions),
                outputs.size())
コード例 #3
0
    def __init__(self,
                 input_size,
                 encoder_type,
                 encoder_bidirectional,
                 encoder_num_units,
                 encoder_num_proj,
                 encoder_num_layers,
                 fc_list,
                 dropout_input,
                 dropout_encoder,
                 num_classes,
                 parameter_init_distribution='uniform',
                 parameter_init=0.1,
                 recurrent_weight_orthogonal=False,
                 init_forget_gate_bias_with_one=True,
                 subsample_list=[],
                 subsample_type='drop',
                 logits_temperature=1,
                 num_stack=1,
                 splice=1,
                 input_channel=1,
                 conv_channels=[],
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 poolings=[],
                 activation='relu',
                 batch_norm=False,
                 label_smoothing_prob=0,
                 weight_noise_std=0,
                 encoder_residual=False,
                 encoder_dense_residual=False):

        super(ModelBase, self).__init__()
        self.model_type = 'ctc'

        # Setting for the encoder
        self.input_size = input_size
        self.num_stack = num_stack
        self.encoder_type = encoder_type
        self.encoder_num_units = encoder_num_units
        if encoder_bidirectional:
            self.encoder_num_units *= 2
        self.fc_list = fc_list
        self.subsample_list = subsample_list

        # Setting for CTC
        self.num_classes = num_classes + 1  # Add the blank class
        self.logits_temperature = logits_temperature

        # Setting for regualarization
        self.weight_noise_injection = False
        self.weight_noise_std = float(weight_noise_std)
        self.ls_prob = label_smoothing_prob

        # Call the encoder function
        if encoder_type in ['lstm', 'gru', 'rnn']:
            self.encoder = load(encoder_type=encoder_type)(
                input_size=input_size,
                rnn_type=encoder_type,
                bidirectional=encoder_bidirectional,
                num_units=encoder_num_units,
                num_proj=encoder_num_proj,
                num_layers=encoder_num_layers,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                subsample_list=subsample_list,
                subsample_type=subsample_type,
                batch_first=True,
                merge_bidirectional=False,
                pack_sequence=True,
                num_stack=num_stack,
                splice=splice,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                activation=activation,
                batch_norm=batch_norm,
                residual=encoder_residual,
                dense_residual=encoder_dense_residual,
                nin=0)
        elif encoder_type == 'cnn':
            assert num_stack == 1 and splice == 1
            self.encoder = load(encoder_type='cnn')(
                input_size=input_size,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                activation=activation,
                batch_norm=batch_norm)
        else:
            raise NotImplementedError

        ##################################################
        # Fully-connected layers
        ##################################################
        if len(fc_list) > 0:
            for i in range(len(fc_list)):
                if i == 0:
                    if encoder_type == 'cnn':
                        bottle_input_size = self.encoder.output_size
                    else:
                        bottle_input_size = self.encoder_num_units

                    # if batch_norm:
                    #     setattr(self, 'bn_fc_0', nn.BatchNorm1d(
                    #         bottle_input_size))

                    setattr(
                        self, 'fc_0',
                        LinearND(bottle_input_size,
                                 fc_list[i],
                                 dropout=dropout_encoder))
                else:
                    # if batch_norm:
                    #     setattr(self, 'fc_bn_' + str(i),
                    #             nn.BatchNorm1d(fc_list[i - 1]))

                    setattr(
                        self, 'fc_' + str(i),
                        LinearND(fc_list[i - 1],
                                 fc_list[i],
                                 dropout=dropout_encoder))
            # TODO: remove a bias term in the case of batch normalization

            self.fc_out = LinearND(fc_list[-1], self.num_classes)
        else:
            self.fc_out = LinearND(self.encoder_num_units, self.num_classes)

        ##################################################
        # Initialize parameters
        ##################################################
        self.init_weights(parameter_init,
                          distribution=parameter_init_distribution,
                          ignore_keys=['bias'])

        # Initialize all biases with 0
        self.init_weights(0, distribution='constant', keys=['bias'])

        # Recurrent weights are orthogonalized
        if recurrent_weight_orthogonal and encoder_type != 'cnn':
            self.init_weights(parameter_init,
                              distribution='orthogonal',
                              keys=[encoder_type, 'weight'],
                              ignore_keys=['bias'])

        # Initialize bias in forget gate with 1
        if init_forget_gate_bias_with_one:
            self.init_forget_gate_bias_with_one()

        # Set CTC decoders
        self._decode_greedy_np = GreedyDecoder(blank_index=0)
        self._decode_beam_np = BeamSearchDecoder(blank_index=0)
コード例 #4
0
    def __init__(
            self,
            input_size,
            encoder_type,
            encoder_bidirectional,
            encoder_num_units,
            encoder_num_proj,
            encoder_num_layers,
            encoder_num_layers_sub,  # ***
            attention_type,
            attention_dim,
            decoder_type,
            decoder_num_units,
            decoder_num_units_sub,  # ***
            decoder_num_layers,
            decoder_num_layers_sub,  # ***
            embedding_dim,
            embedding_dim_sub,  # ***
            dropout_input,
            dropout_encoder,
            dropout_decoder,
            dropout_embedding,
            main_loss_weight,  # ***
            sub_loss_weight,  # ***
            num_classes,
            num_classes_sub,  # ***
            parameter_init_distribution='uniform',
            parameter_init=0.1,
            recurrent_weight_orthogonal=False,
            init_forget_gate_bias_with_one=True,
            subsample_list=[],
            subsample_type='drop',
            bridge_layer=False,
            init_dec_state='first',
            sharpening_factor=1,
            logits_temperature=1,
            sigmoid_smoothing=False,
            coverage_weight=0,
            ctc_loss_weight_sub=0,  # ***
            attention_conv_num_channels=10,
            attention_conv_width=201,
            num_stack=1,
            splice=1,
            input_channel=1,
            conv_channels=[],
            conv_kernel_sizes=[],
            conv_strides=[],
            poolings=[],
            activation='relu',
            batch_norm=False,
            scheduled_sampling_prob=0,
            scheduled_sampling_max_step=0,
            label_smoothing_prob=0,
            weight_noise_std=0,
            encoder_residual=False,
            encoder_dense_residual=False,
            decoder_residual=False,
            decoder_dense_residual=False,
            decoding_order='attend_generate_update',
            bottleneck_dim=256,
            bottleneck_dim_sub=256,  # ***
            backward_sub=False,  # ***
            num_heads=1,
            num_heads_sub=1):  # ***

        super(HierarchicalAttentionSeq2seq, self).__init__(
            input_size=input_size,
            encoder_type=encoder_type,
            encoder_bidirectional=encoder_bidirectional,
            encoder_num_units=encoder_num_units,
            encoder_num_proj=encoder_num_proj,
            encoder_num_layers=encoder_num_layers,
            attention_type=attention_type,
            attention_dim=attention_dim,
            decoder_type=decoder_type,
            decoder_num_units=decoder_num_units,
            decoder_num_layers=decoder_num_layers,
            embedding_dim=embedding_dim,
            dropout_input=dropout_input,
            dropout_encoder=dropout_encoder,
            dropout_decoder=dropout_decoder,
            dropout_embedding=dropout_embedding,
            num_classes=num_classes,
            parameter_init=parameter_init,
            subsample_list=subsample_list,
            subsample_type=subsample_type,
            bridge_layer=bridge_layer,
            init_dec_state=init_dec_state,
            sharpening_factor=sharpening_factor,
            logits_temperature=logits_temperature,
            sigmoid_smoothing=sigmoid_smoothing,
            coverage_weight=coverage_weight,
            ctc_loss_weight=0,
            attention_conv_num_channels=attention_conv_num_channels,
            attention_conv_width=attention_conv_width,
            num_stack=num_stack,
            splice=splice,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            scheduled_sampling_prob=scheduled_sampling_prob,
            scheduled_sampling_max_step=scheduled_sampling_max_step,
            label_smoothing_prob=label_smoothing_prob,
            weight_noise_std=weight_noise_std,
            encoder_residual=encoder_residual,
            encoder_dense_residual=encoder_dense_residual,
            decoder_residual=decoder_residual,
            decoder_dense_residual=decoder_dense_residual,
            decoding_order=decoding_order,
            bottleneck_dim=bottleneck_dim,
            backward_loss_weight=0,
            num_heads=num_heads)
        self.model_type = 'hierarchical_attention'

        # Setting for the encoder
        self.encoder_num_units_sub = encoder_num_units
        if encoder_bidirectional:
            self.encoder_num_units_sub *= 2

        # Setting for the decoder in the sub task
        self.decoder_num_units_1 = decoder_num_units_sub
        self.decoder_num_layers_1 = decoder_num_layers_sub
        self.num_classes_sub = num_classes_sub + 1  # Add <EOS> class
        self.sos_1 = num_classes_sub
        self.eos_1 = num_classes_sub
        # NOTE: <SOS> and <EOS> have the same index
        self.backward_1 = backward_sub

        # Setting for the decoder initialization in the sub task
        if backward_sub:
            if init_dec_state == 'first':
                self.init_dec_state_1_bwd = 'final'
            elif init_dec_state == 'final':
                self.init_dec_state_1_bwd = 'first'
            else:
                self.init_dec_state_1_bwd = init_dec_state
            if encoder_type != decoder_type:
                self.init_dec_state_1_bwd = 'zero'
        else:
            self.init_dec_state_1_fwd = init_dec_state
            if encoder_type != decoder_type:
                self.init_dec_state_1_fwd = 'zero'

        # Setting for the attention in the sub task
        self.num_heads_1 = num_heads_sub

        # Setting for MTL
        self.main_loss_weight = main_loss_weight
        self.sub_loss_weight = sub_loss_weight
        self.ctc_loss_weight_sub = ctc_loss_weight_sub
        if backward_sub:
            self.bwd_weight_1 = sub_loss_weight

        ##############################
        # Encoder
        # NOTE: overide encoder
        ##############################
        if encoder_type in ['lstm', 'gru', 'rnn']:
            self.encoder = load(encoder_type=encoder_type)(
                input_size=input_size,
                rnn_type=encoder_type,
                bidirectional=encoder_bidirectional,
                num_units=encoder_num_units,
                num_proj=encoder_num_proj,
                num_layers=encoder_num_layers,
                num_layers_sub=encoder_num_layers_sub,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                subsample_list=subsample_list,
                subsample_type=subsample_type,
                batch_first=True,
                merge_bidirectional=False,
                pack_sequence=True,
                num_stack=num_stack,
                splice=splice,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                activation=activation,
                batch_norm=batch_norm,
                residual=encoder_residual,
                dense_residual=encoder_dense_residual)
        elif encoder_type == 'cnn':
            assert num_stack == 1 and splice == 1
            self.encoder = load(encoder_type='cnn')(
                input_size=input_size,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                activation=activation,
                batch_norm=batch_norm)
            self.init_dec_state_0 = 'zero'
            self.init_dec_state_1 = 'zero'
        else:
            raise NotImplementedError

        dir = 'bwd' if backward_sub else 'fwd'
        self.is_bridge_sub = False
        if self.sub_loss_weight > 0:
            ##################################################
            # Bridge layer between the encoder and decoder
            ##################################################
            if encoder_type == 'cnn':
                self.bridge_1 = LinearND(self.encoder.output_size,
                                         decoder_num_units_sub,
                                         dropout=dropout_encoder)
                self.encoder_num_units_sub = decoder_num_units_sub
                self.is_bridge_sub = True
            elif bridge_layer:
                self.bridge_1 = LinearND(self.encoder_num_units_sub,
                                         decoder_num_units_sub,
                                         dropout=dropout_encoder)
                self.encoder_num_units_sub = decoder_num_units_sub
                self.is_bridge_sub = True
            else:
                self.is_bridge_sub = False

            ##################################################
            # Initialization of the decoder
            ##################################################
            if getattr(self, 'init_dec_state_1_' + dir) != 'zero':
                setattr(
                    self, 'W_dec_init_1_' + dir,
                    LinearND(self.encoder_num_units_sub,
                             decoder_num_units_sub))

            ##############################
            # Decoder (sub)
            ##############################
            if decoding_order == 'conditional':
                setattr(
                    self, 'decoder_first_1_' + dir,
                    RNNDecoder(input_size=embedding_dim_sub,
                               rnn_type=decoder_type,
                               num_units=decoder_num_units_sub,
                               num_layers=1,
                               dropout=dropout_decoder,
                               residual=False,
                               dense_residual=False))
                setattr(
                    self, 'decoder_second_1_' + dir,
                    RNNDecoder(input_size=self.encoder_num_units_sub,
                               rnn_type=decoder_type,
                               num_units=decoder_num_units_sub,
                               num_layers=1,
                               dropout=dropout_decoder,
                               residual=False,
                               dense_residual=False))
                # NOTE; the conditional decoder only supports the 1 layer
            else:
                setattr(
                    self, 'decoder_1_' + dir,
                    RNNDecoder(input_size=self.encoder_num_units_sub +
                               embedding_dim_sub,
                               rnn_type=decoder_type,
                               num_units=decoder_num_units_sub,
                               num_layers=decoder_num_layers_sub,
                               dropout=dropout_decoder,
                               residual=decoder_residual,
                               dense_residual=decoder_dense_residual))

            ###################################
            # Attention layer (sub)
            ###################################
            setattr(
                self, 'attend_1_' + dir,
                AttentionMechanism(
                    encoder_num_units=self.encoder_num_units_sub,
                    decoder_num_units=decoder_num_units_sub,
                    attention_type=attention_type,
                    attention_dim=attention_dim,
                    sharpening_factor=sharpening_factor,
                    sigmoid_smoothing=sigmoid_smoothing,
                    out_channels=attention_conv_num_channels,
                    kernel_size=attention_conv_width,
                    num_heads=num_heads_sub))

            ##############################
            # Output layer (sub)
            ##############################
            setattr(
                self, 'W_d_1_' + dir,
                LinearND(decoder_num_units_sub,
                         bottleneck_dim_sub,
                         dropout=dropout_decoder))
            setattr(
                self, 'W_c_1_' + dir,
                LinearND(self.encoder_num_units_sub,
                         bottleneck_dim_sub,
                         dropout=dropout_decoder))
            setattr(self, 'fc_1_' + dir,
                    LinearND(bottleneck_dim_sub, self.num_classes_sub))

            ##############################
            # Embedding (sub)
            ##############################
            if label_smoothing_prob > 0:
                self.embed_1 = Embedding_LS(
                    num_classes=self.num_classes_sub,
                    embedding_dim=embedding_dim_sub,
                    dropout=dropout_embedding,
                    label_smoothing_prob=label_smoothing_prob)
            else:
                self.embed_1 = Embedding(num_classes=self.num_classes_sub,
                                         embedding_dim=embedding_dim_sub,
                                         dropout=dropout_embedding,
                                         ignore_index=-1)

        ##############################
        # CTC (sub)
        ##############################
        if ctc_loss_weight_sub > 0:
            self.fc_ctc_1 = LinearND(self.encoder_num_units_sub,
                                     num_classes_sub + 1)

            # Set CTC decoders
            self._decode_ctc_greedy_np = GreedyDecoder(blank_index=0)
            self._decode_ctc_beam_np = BeamSearchDecoder(blank_index=0)
            # NOTE: index 0 is reserved for the blank class

        ##################################################
        # Initialize parameters
        ##################################################
        self.init_weights(parameter_init,
                          distribution=parameter_init_distribution,
                          ignore_keys=['bias'])

        # Initialize all biases with 0
        self.init_weights(0, distribution='constant', keys=['bias'])

        # Recurrent weights are orthogonalized
        if recurrent_weight_orthogonal:
            self.init_weights(parameter_init,
                              distribution='orthogonal',
                              keys=[encoder_type, 'weight'],
                              ignore_keys=['bias'])
            self.init_weights(parameter_init,
                              distribution='orthogonal',
                              keys=[decoder_type, 'weight'],
                              ignore_keys=['bias'])

        # Initialize bias in forget gate with 1
        if init_forget_gate_bias_with_one:
            self.init_forget_gate_bias_with_one()