Example #1
0
def register_args_decoder(parser, args, dec_type):
    """Register decoder-specific CLI arguments for the given decoder type.

    Selects the decoder class matching ``dec_type`` and, when that class
    exposes an ``add_args`` hook, lets it extend ``parser``.
    """
    if dec_type in ['transformer']:
        from neural_sp.models.seq2seq.decoders.transformer import TransformerDecoder
        decoder_cls = TransformerDecoder
    elif dec_type in ['lstm_transducer', 'gru_transducer']:
        from neural_sp.models.seq2seq.decoders.rnn_transducer import RNNTransducer
        decoder_cls = RNNTransducer
    elif dec_type == 'asg':
        from neural_sp.models.seq2seq.decoders.asg import ASGDecoder
        decoder_cls = ASGDecoder
    else:
        from neural_sp.models.seq2seq.decoders.las import RNNDecoder
        decoder_cls = RNNDecoder
    add_args = getattr(decoder_cls, 'add_args', None)
    if add_args is not None:
        parser = add_args(parser, args)
    return parser
Example #2
0
def register_args_encoder(parser, args):
    """Register encoder-specific CLI arguments based on ``args.enc_type``.

    Picks the encoder class for the configured encoder type and, when the
    class defines an ``add_args`` hook, lets it extend ``parser``.
    """
    if args.enc_type == 'tds':
        from neural_sp.models.seq2seq.encoders.tds import TDSEncoder
        encoder_cls = TDSEncoder
    elif args.enc_type == 'gated_conv':
        from neural_sp.models.seq2seq.encoders.gated_conv import GatedConvEncoder
        encoder_cls = GatedConvEncoder
    elif 'transformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.transformer import TransformerEncoder
        encoder_cls = TransformerEncoder
    else:
        from neural_sp.models.seq2seq.encoders.rnn import RNNEncoder
        encoder_cls = RNNEncoder
    add_args = getattr(encoder_cls, 'add_args', None)
    if add_args is not None:
        parser = add_args(parser, args)
    return parser
Example #3
0
def _define_lm_name(dir_name, args):
    """Delegate result-directory naming to the LM class for ``args.lm_type``."""
    if 'gated_conv' in args.lm_type:
        from neural_sp.models.lm.gated_convlm import GatedConvLM
        lm_cls = GatedConvLM
    elif args.lm_type == 'transformer':
        from neural_sp.models.lm.transformerlm import TransformerLM
        lm_cls = TransformerLM
    elif args.lm_type == 'transformer_xl':
        from neural_sp.models.lm.transformer_xl import TransformerXL
        lm_cls = TransformerXL
    else:
        from neural_sp.models.lm.rnnlm import RNNLM
        lm_cls = RNNLM
    # Every supported LM class must provide `define_name`.
    if not hasattr(lm_cls, 'define_name'):
        raise NotImplementedError(lm_cls)
    return lm_cls.define_name(dir_name, args)
Example #4
0
def _define_encoder_name(dir_name, args):
    """Delegate result-directory naming to the encoder class for ``args.enc_type``."""
    if args.enc_type == 'tds':
        from neural_sp.models.seq2seq.encoders.tds import TDSEncoder
        encoder_cls = TDSEncoder
    elif args.enc_type == 'gated_conv':
        from neural_sp.models.seq2seq.encoders.gated_conv import GatedConvEncoder
        encoder_cls = GatedConvEncoder
    elif 'transformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.transformer import TransformerEncoder
        encoder_cls = TransformerEncoder
    elif 'conformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.conformer import ConformerEncoder
        encoder_cls = ConformerEncoder
    else:
        from neural_sp.models.seq2seq.encoders.rnn import RNNEncoder
        encoder_cls = RNNEncoder
    # Every supported encoder class must provide `define_name`.
    if not hasattr(encoder_cls, 'define_name'):
        raise NotImplementedError(encoder_cls)
    return encoder_cls.define_name(dir_name, args)
Example #5
0
def _define_decoder_name(dir_name, args):
    """Delegate result-directory naming to the decoder class for ``args.dec_type``."""
    if args.dec_type in ['transformer', 'transformer_xl']:
        from neural_sp.models.seq2seq.decoders.transformer import TransformerDecoder
        decoder_cls = TransformerDecoder
    elif args.dec_type in [
            'transformer_transducer', 'transformer_transducer_xl'
    ]:
        from neural_sp.models.seq2seq.decoders.transformer_transducer import TransformerTransducer
        decoder_cls = TransformerTransducer
    elif args.dec_type in ['lstm_transducer', 'gru_transducer']:
        from neural_sp.models.seq2seq.decoders.rnn_transducer import RNNTransducer
        decoder_cls = RNNTransducer
    elif args.dec_type == 'asg':
        from neural_sp.models.seq2seq.decoders.asg import ASGDecoder
        decoder_cls = ASGDecoder
    else:
        from neural_sp.models.seq2seq.decoders.las import RNNDecoder
        decoder_cls = RNNDecoder
    # Every supported decoder class must provide `define_name`.
    if not hasattr(decoder_cls, 'define_name'):
        raise NotImplementedError(decoder_cls)
    return decoder_cls.define_name(dir_name, args)
Example #6
0
    def __init__(self, input_dim, rnn_type, n_units, n_projs, last_proj_dim,
                 n_layers, n_layers_sub1, n_layers_sub2,
                 dropout_in, dropout,
                 subsample, subsample_type, n_stacks, n_splices,
                 conv_in_channel, conv_channels, conv_kernel_sizes, conv_strides, conv_poolings,
                 conv_batch_norm, conv_layer_norm, conv_bottleneck_dim,
                 bidirectional_sum_fwd_bwd, task_specific_layer, param_init,
                 chunk_size_left, chunk_size_right):
        """Build an RNN encoder, optionally preceded by a CNN/TDS/GatedConv front-end.

        Args:
            input_dim (int): dimension of each input frame (before stacking/splicing)
            rnn_type (str): e.g. '(conv_)(b)lstm', '(conv_)(b)gru', 'tds', 'gated_conv';
                a 'b' selects a bidirectional RNN
            n_units (int): hidden units per RNN layer (per direction)
            n_projs (int): projection size inserted between RNN layers (0: none)
            last_proj_dim (int): bridge (output projection) dimension (0: no bridge)
            n_layers (int): number of RNN layers
            n_layers_sub1 (int): 1-based layer index where the first auxiliary-task
                branch taps the encoder (0: disabled)
            n_layers_sub2 (int): same for the second auxiliary-task branch
            dropout_in (float): dropout probability on the inputs
            dropout (float): dropout probability between RNN layers
            subsample (str): '_'-separated per-layer temporal subsampling factors
            subsample_type (str): 'max_pool' | 'concat' | 'drop' | '1dconv'
            n_stacks (int): frame stacking factor
            n_splices (int): frame splicing factor
            conv_* : configuration of the convolutional front-end (only used when
                rnn_type contains 'conv' or equals 'tds'/'gated_conv')
            bidirectional_sum_fwd_bwd (bool): sum forward/backward outputs instead
                of concatenating them
            task_specific_layer (bool): add a dedicated RNN layer per sub-task branch
            param_init (float): parameter initialization range
            chunk_size_left (int): left chunk size for latency-controlled encoding
            chunk_size_right (int): right chunk size; either value > 0 enables
                latency-controlled (streaming) mode
        """

        super(RNNEncoder, self).__init__()

        # parse subsample
        # `subsample` is a '_'-joined string (e.g. '1_2_2_1'); layers without an
        # explicit entry keep the default factor of 1.
        subsample_list = [1] * n_layers
        for lth, s in enumerate(list(map(int, subsample.split('_')[:n_layers]))):
            subsample_list[lth] = s

        # NOTE(review): subsample_list is built with exactly n_layers entries
        # just above, so this guard can never fire as written.
        if len(subsample_list) > 0 and len(subsample_list) != n_layers:
            raise ValueError('subsample must be the same size as n_layers. n_layers: %d, subsample: %s' %
                             (n_layers, subsample_list))
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers. n_layers: %d, n_layers_sub1: %d' %
                             (n_layers, n_layers_sub1))
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1. n_layers_sub1: %d, n_layers_sub2: %d' %
                             (n_layers_sub1, n_layers_sub2))

        self.rnn_type = rnn_type
        self.bidirectional = True if ('blstm' in rnn_type or 'bgru' in rnn_type) else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_layers = n_layers
        self.bidir_sum = bidirectional_sum_fwd_bwd

        # for latency-controlled
        # Streaming mode is on as soon as either chunk size is positive.
        self.latency_controlled = chunk_size_left > 0 or chunk_size_right > 0
        self.chunk_size_left = chunk_size_left
        self.chunk_size_right = chunk_size_right
        if self.latency_controlled:
            # The second sub-task branch is not supported in streaming mode.
            assert n_layers_sub2 == 0

        # for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # for bridge layers
        # Created lazily below only when last_proj_dim requires a projection.
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        # Front-end selection: TDS / GatedConv replace the RNN entirely;
        # a 'conv' prefix adds a ConvEncoder before the RNN stack.
        if rnn_type == 'tds':
            self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                   in_channel=conv_in_channel,
                                   channels=conv_channels,
                                   kernel_sizes=conv_kernel_sizes,
                                   dropout=dropout,
                                   bottleneck_dim=last_proj_dim)
        elif rnn_type == 'gated_conv':
            self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                         in_channel=conv_in_channel,
                                         channels=conv_channels,
                                         kernel_sizes=conv_kernel_sizes,
                                         dropout=dropout,
                                         bottleneck_dim=last_proj_dim,
                                         param_init=param_init)

        elif 'conv' in rnn_type:
            assert n_stacks == 1 and n_splices == 1
            self.conv = ConvEncoder(input_dim,
                                    in_channel=conv_in_channel,
                                    channels=conv_channels,
                                    kernel_sizes=conv_kernel_sizes,
                                    strides=conv_strides,
                                    poolings=conv_poolings,
                                    dropout=0.,
                                    batch_norm=conv_batch_norm,
                                    layer_norm=conv_layer_norm,
                                    residual=False,
                                    bottleneck_dim=conv_bottleneck_dim,
                                    param_init=param_init)
        else:
            self.conv = None

        if self.conv is None:
            self._odim = input_dim * n_splices * n_stacks
        else:
            self._odim = self.conv.output_dim
            # The CNN front-end already subsamples, so per-layer RNN
            # subsampling is disabled.
            subsample_list = [1] * self.n_layers
            logger.warning('Subsampling is automatically ignored because CNN layers are used before RNN layers.')

        self.padding = Padding(bidirectional_sum_fwd_bwd=bidirectional_sum_fwd_bwd)

        if rnn_type not in ['conv', 'tds', 'gated_conv']:
            self.rnn = nn.ModuleList()
            if self.latency_controlled:
                # Streaming mode keeps forward and backward streams as two
                # separate unidirectional RNN stacks.
                self.rnn_bwd = nn.ModuleList()
            self.dropout = nn.Dropout(p=dropout)
            self.proj = None
            if n_projs > 0:
                self.proj = nn.ModuleList()

            # subsample
            # One subsampler per layer (factor 1 entries are effectively no-ops).
            self.subsample_layer = None
            if subsample_type == 'max_pool' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([MaxpoolSubsampler(subsample_list[lth])
                                                      for lth in range(n_layers)])
            elif subsample_type == 'concat' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([ConcatSubsampler(subsample_list[lth], n_units * self.n_dirs)
                                                      for lth in range(n_layers)])
            elif subsample_type == 'drop' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([DropSubsampler(subsample_list[lth])
                                                      for lth in range(n_layers)])
            elif subsample_type == '1dconv' and np.prod(subsample_list) > 1:
                self.subsample_layer = nn.ModuleList([Conv1dSubsampler(subsample_list[lth], n_units * self.n_dirs)
                                                      for lth in range(n_layers)])

            for lth in range(n_layers):
                if 'lstm' in rnn_type:
                    rnn_i = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn_i = nn.GRU
                else:
                    raise ValueError('rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".')

                if self.latency_controlled:
                    # Two unidirectional RNNs (fwd + bwd) instead of one
                    # bidirectional RNN, so chunks can be processed causally.
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                    self.rnn_bwd += [rnn_i(self._odim, n_units, 1, batch_first=True)]
                else:
                    self.rnn += [rnn_i(self._odim, n_units, 1, batch_first=True,
                                       bidirectional=self.bidirectional)]
                self._odim = n_units if bidirectional_sum_fwd_bwd else n_units * self.n_dirs

                # Projection layer
                # No projection after the last layer (the bridge handles that).
                if self.proj is not None:
                    if lth != n_layers - 1:
                        self.proj += [nn.Linear(n_units * self.n_dirs, n_projs)]
                        self._odim = n_projs

                # Task specific layer
                if lth == n_layers_sub1 - 1 and task_specific_layer:
                    assert not self.latency_controlled
                    self.rnn_sub1 = rnn_i(self._odim, n_units, 1,
                                          batch_first=True,
                                          bidirectional=self.bidirectional)
                    # NOTE(review): `self.output_dim` is presumably a property
                    # defined elsewhere on this class — confirm.
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub1 = nn.Linear(n_units, last_proj_dim)
                if lth == n_layers_sub2 - 1 and task_specific_layer:
                    assert not self.latency_controlled
                    self.rnn_sub2 = rnn_i(self._odim, n_units, 1,
                                          batch_first=True,
                                          bidirectional=self.bidirectional)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub2 = nn.Linear(n_units, last_proj_dim)

            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge = nn.Linear(self._odim, last_proj_dim)
                self._odim = last_proj_dim

        # calculate subsampling factor
        # Total time-reduction factor: CNN front-end times per-layer subsampling.
        self._factor = 1
        if self.conv is not None:
            self._factor *= self.conv.subsampling_factor
        self._factor *= np.prod(subsample_list)

        self.reset_parameters(param_init)

        # for streaming inference
        self.reset_cache()
Example #7
0
def build_encoder(args):
    """Instantiate the encoder selected by ``args.enc_type``.

    Args:
        args: parsed command-line namespace carrying the encoder configuration.

    Returns:
        The constructed encoder module.

    Raises:
        ValueError: if ``args.enc_type`` is 'gated_conv', which this builder
            no longer supports.
    """

    # safeguard: older configs only define the shared transformer_* options,
    # so mirror them onto the encoder/decoder-specific attributes.
    if not hasattr(args, 'transformer_enc_d_model') and hasattr(args, 'transformer_d_model'):
        args.transformer_enc_d_model = args.transformer_d_model
        args.transformer_dec_d_model = args.transformer_d_model
    if not hasattr(args, 'transformer_enc_d_ff') and hasattr(args, 'transformer_d_ff'):
        args.transformer_enc_d_ff = args.transformer_d_ff
    if not hasattr(args, 'transformer_enc_n_heads') and hasattr(args, 'transformer_n_heads'):
        args.transformer_enc_n_heads = args.transformer_n_heads

    if args.enc_type == 'tds':
        from neural_sp.models.seq2seq.encoders.tds import TDSEncoder
        encoder = TDSEncoder(
            input_dim=args.input_dim * args.n_stacks,
            in_channel=args.conv_in_channel,
            channels=args.conv_channels,
            kernel_sizes=args.conv_kernel_sizes,
            dropout=args.dropout_enc,
            last_proj_dim=args.transformer_dec_d_model if 'transformer' in args.dec_type else args.dec_n_units)

    elif args.enc_type == 'gated_conv':
        from neural_sp.models.seq2seq.encoders.gated_conv import GatedConvEncoder  # noqa: F401
        # This path is disabled; the GatedConvEncoder construction that used
        # to follow the raise was unreachable dead code and has been removed.
        raise ValueError('gated_conv encoder is not supported.')

    elif 'transformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.transformer import TransformerEncoder
        encoder = TransformerEncoder(
            input_dim=args.input_dim if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_heads=args.transformer_enc_n_heads,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            d_model=args.transformer_enc_d_model,
            d_ff=args.transformer_enc_d_ff,
            ffn_bottleneck_dim=args.transformer_ffn_bottleneck_dim,
            ffn_activation=args.transformer_ffn_activation,
            pe_type=args.transformer_enc_pe_type,
            layer_norm_eps=args.transformer_layer_norm_eps,
            last_proj_dim=args.transformer_dec_d_model if 'transformer' in args.dec_type else 0,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            dropout_att=args.dropout_att,
            dropout_layer=args.dropout_enc_layer,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            conv_in_channel=args.conv_in_channel,
            conv_channels=args.conv_channels,
            conv_kernel_sizes=args.conv_kernel_sizes,
            conv_strides=args.conv_strides,
            conv_poolings=args.conv_poolings,
            conv_batch_norm=args.conv_batch_norm,
            conv_layer_norm=args.conv_layer_norm,
            conv_bottleneck_dim=args.conv_bottleneck_dim,
            conv_param_init=args.param_init,
            task_specific_layer=args.task_specific_layer,
            param_init=args.transformer_param_init,
            clamp_len=args.transformer_enc_clamp_len,
            lookahead=args.transformer_enc_lookaheads,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_current=args.lc_chunk_size_current,
            chunk_size_right=args.lc_chunk_size_right,
            streaming_type=args.lc_type)

    elif 'conformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.conformer import ConformerEncoder
        encoder = ConformerEncoder(
            input_dim=args.input_dim if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_heads=args.transformer_enc_n_heads,
            kernel_size=args.conformer_kernel_size,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            d_model=args.transformer_enc_d_model,
            d_ff=args.transformer_enc_d_ff,
            ffn_bottleneck_dim=args.transformer_ffn_bottleneck_dim,
            ffn_activation='swish',
            pe_type=args.transformer_enc_pe_type,
            layer_norm_eps=args.transformer_layer_norm_eps,
            last_proj_dim=args.transformer_dec_d_model if 'transformer' in args.dec_type else 0,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            dropout_att=args.dropout_att,
            dropout_layer=args.dropout_enc_layer,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            conv_in_channel=args.conv_in_channel,
            conv_channels=args.conv_channels,
            conv_kernel_sizes=args.conv_kernel_sizes,
            conv_strides=args.conv_strides,
            conv_poolings=args.conv_poolings,
            conv_batch_norm=args.conv_batch_norm,
            conv_layer_norm=args.conv_layer_norm,
            conv_bottleneck_dim=args.conv_bottleneck_dim,
            conv_param_init=args.param_init,
            task_specific_layer=args.task_specific_layer,
            param_init=args.transformer_param_init,
            clamp_len=args.transformer_enc_clamp_len,
            lookahead=args.transformer_enc_lookaheads,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_current=args.lc_chunk_size_current,
            chunk_size_right=args.lc_chunk_size_right,
            streaming_type=args.lc_type)

    else:
        from neural_sp.models.seq2seq.encoders.rnn import RNNEncoder
        encoder = RNNEncoder(
            input_dim=args.input_dim if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            n_units=args.enc_n_units,
            n_projs=args.enc_n_projs,
            last_proj_dim=args.transformer_dec_d_model if 'transformer' in args.dec_type else 0,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            subsample=args.subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            conv_in_channel=args.conv_in_channel,
            conv_channels=args.conv_channels,
            conv_kernel_sizes=args.conv_kernel_sizes,
            conv_strides=args.conv_strides,
            conv_poolings=args.conv_poolings,
            conv_batch_norm=args.conv_batch_norm,
            conv_layer_norm=args.conv_layer_norm,
            conv_bottleneck_dim=args.conv_bottleneck_dim,
            bidir_sum_fwd_bwd=args.bidirectional_sum_fwd_bwd,
            task_specific_layer=args.task_specific_layer,
            param_init=args.param_init,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_right=args.lc_chunk_size_right,
            rsp_prob=args.rsp_prob_enc)

    return encoder
Example #8
0
def build_encoder(args):
    """Instantiate the encoder selected by ``args.enc_type`` (legacy API).

    Args:
        args: parsed command-line namespace carrying the encoder configuration.

    Returns:
        The constructed encoder module.

    Raises:
        ValueError: for the disabled 'tds' and 'gated_conv' encoder types.
    """

    if args.enc_type == 'tds':
        from neural_sp.models.seq2seq.encoders.tds import TDSEncoder  # noqa: F401
        # This path is disabled; the TDSEncoder construction that used to
        # follow the raise was unreachable dead code and has been removed.
        raise ValueError('tds encoder is not supported.')

    elif args.enc_type == 'gated_conv':
        from neural_sp.models.seq2seq.encoders.gated_conv import GatedConvEncoder  # noqa: F401
        # Disabled path; unreachable GatedConvEncoder construction removed.
        raise ValueError('gated_conv encoder is not supported.')

    elif 'transformer' in args.enc_type:
        from neural_sp.models.seq2seq.encoders.transformer import TransformerEncoder
        encoder = TransformerEncoder(
            input_dim=args.input_dim
            if args.input_type == 'speech' else args.emb_dim,
            enc_type=args.enc_type,
            attn_type=args.transformer_attn_type,
            n_heads=args.transformer_n_heads,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            d_model=args.transformer_d_model,
            d_ff=args.transformer_d_ff,
            last_proj_dim=args.transformer_d_model
            if 'transformer' in args.dec_type else 0,
            pe_type=args.transformer_enc_pe_type,
            layer_norm_eps=args.transformer_layer_norm_eps,
            ffn_activation=args.transformer_ffn_activation,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            dropout_att=args.dropout_att,
            dropout_layer=args.dropout_enc_layer,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            conv_in_channel=args.conv_in_channel,
            conv_channels=args.conv_channels,
            conv_kernel_sizes=args.conv_kernel_sizes,
            conv_strides=args.conv_strides,
            conv_poolings=args.conv_poolings,
            conv_batch_norm=args.conv_batch_norm,
            conv_layer_norm=args.conv_layer_norm,
            conv_bottleneck_dim=args.conv_bottleneck_dim,
            conv_param_init=args.param_init,
            task_specific_layer=args.task_specific_layer,
            param_init=args.transformer_param_init,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_current=args.lc_chunk_size_current,
            chunk_size_right=args.lc_chunk_size_right)

    else:
        # Parse the '_'-separated per-layer subsampling factors; layers
        # without an explicit entry keep the default factor of 1.
        subsample = [1] * args.enc_n_layers
        for lth, s in enumerate(
                list(map(int,
                         args.subsample.split('_')[:args.enc_n_layers]))):
            subsample[lth] = s

        from neural_sp.models.seq2seq.encoders.rnn import RNNEncoder
        encoder = RNNEncoder(
            input_dim=args.input_dim
            if args.input_type == 'speech' else args.emb_dim,
            rnn_type=args.enc_type,
            n_units=args.enc_n_units,
            n_projs=args.enc_n_projs,
            last_proj_dim=args.transformer_d_model
            if 'transformer' in args.dec_type else 0,
            n_layers=args.enc_n_layers,
            n_layers_sub1=args.enc_n_layers_sub1,
            n_layers_sub2=args.enc_n_layers_sub2,
            dropout_in=args.dropout_in,
            dropout=args.dropout_enc,
            subsample=subsample,
            subsample_type=args.subsample_type,
            n_stacks=args.n_stacks,
            n_splices=args.n_splices,
            conv_in_channel=args.conv_in_channel,
            conv_channels=args.conv_channels,
            conv_kernel_sizes=args.conv_kernel_sizes,
            conv_strides=args.conv_strides,
            conv_poolings=args.conv_poolings,
            conv_batch_norm=args.conv_batch_norm,
            conv_layer_norm=args.conv_layer_norm,
            conv_bottleneck_dim=args.conv_bottleneck_dim,
            bidirectional_sum_fwd_bwd=args.bidirectional_sum_fwd_bwd,
            task_specific_layer=args.task_specific_layer,
            param_init=args.param_init,
            chunk_size_left=args.lc_chunk_size_left,
            chunk_size_right=args.lc_chunk_size_right)
        # NOTE: pure Conv/TDS/GatedConv encoders are also included

    return encoder
Example #9
0
    def __init__(self,
                 input_dim,
                 rnn_type,
                 n_units,
                 n_projs,
                 n_layers,
                 dropout_in,
                 dropout,
                 subsample,
                 subsample_type='drop',
                 n_stacks=1,
                 n_splices=1,
                 last_proj_dim=0,
                 conv_in_channel=1,
                 conv_channels=0,
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 conv_poolings=[],
                 conv_batch_norm=False,
                 conv_residual=False,
                 conv_bottleneck_dim=0,
                 residual=False,
                 n_layers_sub1=0,
                 n_layers_sub2=0,
                 nin=False,
                 task_specific_layer=False,
                 param_init=0.1):

        super(RNNEncoder, self).__init__()

        logger = logging.getLogger("training")

        if len(subsample) > 0 and len(subsample) != n_layers:
            raise ValueError('subsample must be the same size as n_layers.')
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

        self.rnn_type = rnn_type
        self.bidirectional = True if rnn_type in [
            'blstm', 'bgru', 'conv_blstm', 'conv_bgru'
        ] else False
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_projs = n_projs
        self.n_layers = n_layers

        # Setting for hierarchical encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # Setting for subsampling
        self.subsample = subsample
        self.subsample_type = subsample_type

        # Setting for bridge layers
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Setting for residual connections
        self.residual = residual
        if residual:
            assert np.prod(subsample) == 1

        # Setting for the NiN (Network in Network)
        self.nin = nin

        # Dropout for input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        # Setting for CNNs before RNNs
        if conv_channels and rnn_type not in ['blstm', 'lstm', 'bgru', 'gru']:
            channels = [int(c) for c in conv_channels.split('_')
                        ] if len(conv_channels) > 0 else []
            kernel_sizes = [[
                int(c.split(',')[0].replace('(', '')),
                int(c.split(',')[1].replace(')', ''))
            ] for c in conv_kernel_sizes.split('_')
                            ] if len(conv_kernel_sizes) > 0 else []
            if rnn_type in ['tds', 'gated_conv']:
                strides = []
                poolings = []
            else:
                strides = [[
                    int(c.split(',')[0].replace('(', '')),
                    int(c.split(',')[1].replace(')', ''))
                ] for c in conv_strides.split('_')
                           ] if len(conv_strides) > 0 else []
                poolings = [[
                    int(c.split(',')[0].replace('(', '')),
                    int(c.split(',')[1].replace(')', ''))
                ] for c in conv_poolings.split('_')
                            ] if len(conv_poolings) > 0 else []
            if 'conv_' in rnn_type:
                self.subsample = [1] * self.n_layers
                logger.warning(
                    'Subsampling is automatically ignored because CNN layers are used before RNN layers.'
                )
        else:
            channels = []
            kernel_sizes = []
            strides = []
            poolings = []

        if len(channels) > 0:
            if rnn_type == 'tds':
                self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                       in_channel=conv_in_channel,
                                       channels=channels,
                                       kernel_sizes=kernel_sizes,
                                       dropout=dropout,
                                       bottleneck_dim=last_proj_dim)
            elif rnn_type == 'gated_conv':
                self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                             in_channel=conv_in_channel,
                                             channels=channels,
                                             kernel_sizes=kernel_sizes,
                                             dropout=dropout,
                                             bottleneck_dim=last_proj_dim,
                                             param_init=param_init)
            else:
                assert n_stacks == 1 and n_splices == 1
                self.conv = ConvEncoder(input_dim,
                                        in_channel=conv_in_channel,
                                        channels=channels,
                                        kernel_sizes=kernel_sizes,
                                        strides=strides,
                                        poolings=poolings,
                                        dropout=0,
                                        batch_norm=conv_batch_norm,
                                        residual=conv_residual,
                                        bottleneck_dim=conv_bottleneck_dim,
                                        param_init=param_init)
            self._output_dim = self.conv.output_dim
        else:
            self._output_dim = input_dim * n_splices * n_stacks
            self.conv = None

        if rnn_type not in ['conv', 'tds', 'gated_conv']:
            # Fast implementation without processes between each layer
            self.fast_impl = False
            if np.prod(
                    self.subsample
            ) == 1 and self.n_projs == 0 and not residual and n_layers_sub1 == 0 and not nin:
                self.fast_impl = True
                if 'lstm' in rnn_type:
                    rnn = nn.LSTM
                elif 'gru' in rnn_type:
                    rnn = nn.GRU
                else:
                    raise ValueError(
                        'rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".'
                    )

                self.rnn = rnn(self._output_dim,
                               n_units,
                               n_layers,
                               bias=True,
                               batch_first=True,
                               dropout=dropout,
                               bidirectional=self.bidirectional)
                # NOTE: pytorch introduces a dropout layer on the outputs of each layer EXCEPT the last layer
                self._output_dim = n_units * self.n_dirs
                self.dropout_top = nn.Dropout(p=dropout)
            else:
                self.rnn = nn.ModuleList()
                self.dropout = nn.ModuleList()
                if self.n_projs > 0:
                    self.proj = nn.ModuleList()
                if subsample_type == 'max_pool' and np.prod(
                        self.subsample) > 1:
                    self.max_pool = nn.ModuleList()
                    for l in range(n_layers):
                        if self.subsample[l] > 1:
                            self.max_pool += [
                                nn.MaxPool2d((1, 1),
                                             stride=(self.subsample[l], 1),
                                             ceil_mode=True)
                            ]
                        else:
                            self.max_pool += [None]
                if subsample_type == 'concat' and np.prod(self.subsample) > 1:
                    self.concat_proj = nn.ModuleList()
                    self.concat_bn = nn.ModuleList()
                    for l in range(n_layers):
                        if self.subsample[l] > 1:
                            self.concat_proj += [
                                LinearND(
                                    n_units * self.n_dirs * self.subsample[l],
                                    n_units * self.n_dirs)
                            ]
                            self.concat_bn += [
                                nn.BatchNorm1d(n_units * self.n_dirs)
                            ]
                        else:
                            self.concat_proj += [None]
                            self.concat_bn += [None]
                if nin:
                    self.nin_conv = nn.ModuleList()
                    self.nin_bn = nn.ModuleList()

                for l in range(n_layers):
                    if 'lstm' in rnn_type:
                        rnn_i = nn.LSTM
                    elif 'gru' in rnn_type:
                        rnn_i = nn.GRU
                    else:
                        raise ValueError(
                            'rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".'
                        )

                    self.rnn += [
                        rnn_i(self._output_dim,
                              n_units,
                              1,
                              bias=True,
                              batch_first=True,
                              dropout=0,
                              bidirectional=self.bidirectional)
                    ]
                    self.dropout += [nn.Dropout(p=dropout)]
                    self._output_dim = n_units * self.n_dirs

                    # Projection layer
                    if n_projs > 0 and l != n_layers - 1:
                        self.proj += [LinearND(n_units * self.n_dirs, n_projs)]
                        self._output_dim = n_projs

                    # Task specific layer
                    if l == n_layers_sub1 - 1 and task_specific_layer:
                        self.rnn_sub1 = rnn_i(self._output_dim,
                                              n_units,
                                              1,
                                              bias=True,
                                              batch_first=True,
                                              dropout=0,
                                              bidirectional=self.bidirectional)
                        self.dropout_sub1 = nn.Dropout(p=dropout)
                        if last_proj_dim != self.output_dim:
                            self.bridge_sub1 = LinearND(n_units,
                                                        last_proj_dim,
                                                        dropout=dropout)
                    if l == n_layers_sub2 - 1 and task_specific_layer:
                        self.rnn_sub2 = rnn_i(self._output_dim,
                                              n_units,
                                              1,
                                              bias=True,
                                              batch_first=True,
                                              dropout=0,
                                              bidirectional=self.bidirectional)
                        self.dropout_sub2 = nn.Dropout(p=dropout)
                        if last_proj_dim != self.output_dim:
                            self.bridge_sub2 = LinearND(n_units,
                                                        last_proj_dim,
                                                        dropout=dropout)

                    # Network in network (1*1 conv + batch normalization + ReLU)
                    # NOTE: exclude the last layer
                    if nin and l != n_layers - 1:
                        self.nin_conv += [
                            nn.Conv2d(in_channels=self._output_dim,
                                      out_channels=self._output_dim,
                                      kernel_size=1,
                                      stride=1,
                                      padding=0)
                        ]
                        self.nin_bn += [nn.BatchNorm2d(self._output_dim)]
                        if n_layers_sub1 > 0 or n_layers_sub2 > 0:
                            assert task_specific_layer

                if last_proj_dim != self.output_dim:
                    self.bridge = LinearND(self._output_dim,
                                           last_proj_dim,
                                           dropout=dropout)
                    self._output_dim = last_proj_dim

        # Initialize parameters
        self.reset_parameters(param_init)
# Exemple #10
# 0
    def __init__(self,
                 input_dim,
                 rnn_type,
                 n_units,
                 n_projs,
                 n_layers,
                 dropout_in,
                 dropout,
                 subsample,
                 subsample_type='drop',
                 n_stacks=1,
                 n_splices=1,
                 last_proj_dim=0,
                 conv_in_channel=1,
                 conv_channels=0,
                 conv_kernel_sizes=[],
                 conv_strides=[],
                 conv_poolings=[],
                 conv_batch_norm=False,
                 conv_bottleneck_dim=0,
                 n_layers_sub1=0,
                 n_layers_sub2=0,
                 nin=False,
                 task_specific_layer=False,
                 param_init=0.1):
        """Stacked (uni/bi-directional) RNN encoder with optional CNN front-end.

        Args:
            input_dim (int): dimension of a single input frame.
            rnn_type (str): 'blstm'/'lstm'/'bgru'/'gru', optionally with a
                'conv_' prefix (CNN front-end before the RNN stack), or a
                CNN-only encoder type: 'conv', 'tds', 'gated_conv'.
            n_units (int): hidden units per RNN layer (per direction).
            n_projs (int): units of the projection layer inserted after every
                RNN layer except the last one (0 disables projections).
            n_layers (int): number of RNN layers.
            dropout_in (float): dropout probability on the input features.
            dropout (float): dropout probability after each RNN layer.
            subsample (list): per-layer temporal subsampling factors; must be
                empty or exactly n_layers long.
            subsample_type (str): 'drop', 'concat', or 'max_pool'.
            n_stacks (int): number of stacked input frames.
            n_splices (int): number of spliced input frames.
            last_proj_dim (int): dimension of the final bridge projection
                (0 disables all bridge layers).
            conv_in_channel (int): input channels of the CNN front-end.
            conv_channels (str): '_'-separated output channels per CNN block
                (falsy value disables the CNN front-end).
            conv_kernel_sizes (str): '_'-separated '(h,w)' kernel sizes.
            conv_strides (str): '_'-separated '(h,w)' strides.
            conv_poolings (str): '_'-separated '(h,w)' max-pooling sizes.
            conv_batch_norm (bool): apply batch normalization in the CNN.
            conv_bottleneck_dim (int): bottleneck dimension of the CNN output.
            n_layers_sub1 (int): layer index (1-based) where the first
                auxiliary task taps the encoder (0 disables).
            n_layers_sub2 (int): layer index for the second auxiliary task.
            nin (bool): insert 1x1-conv network-in-network blocks between
                RNN layers.
            task_specific_layer (bool): add a dedicated RNN layer per
                auxiliary task.
            param_init (float): range for uniform parameter initialization.

        Raises:
            ValueError: on inconsistent subsample/n_layers_sub* settings or
                an unsupported rnn_type.
        """
        super(RNNEncoder, self).__init__()
        logger = logging.getLogger("training")

        if len(subsample) > 0 and len(subsample) != n_layers:
            raise ValueError('subsample must be the same size as n_layers.')
        if n_layers_sub1 < 0 or (n_layers_sub1 > 1
                                 and n_layers < n_layers_sub1):
            raise ValueError('Set n_layers_sub1 between 1 to n_layers.')
        if n_layers_sub2 < 0 or (n_layers_sub2 > 1
                                 and n_layers_sub1 < n_layers_sub2):
            raise ValueError('Set n_layers_sub2 between 1 to n_layers_sub1.')

        self.rnn_type = rnn_type
        self.bidirectional = rnn_type in [
            'blstm', 'bgru', 'conv_blstm', 'conv_bgru'
        ]
        self.n_units = n_units
        self.n_dirs = 2 if self.bidirectional else 1
        self.n_layers = n_layers

        # Settings for the hierarchical (multi-task) encoder
        self.n_layers_sub1 = n_layers_sub1
        self.n_layers_sub2 = n_layers_sub2
        self.task_specific_layer = task_specific_layer

        # Bridge layers (created below only when last_proj_dim is requested)
        self.bridge = None
        self.bridge_sub1 = None
        self.bridge_sub2 = None

        # Dropout for the input-hidden connection
        self.dropout_in = nn.Dropout(p=dropout_in)

        def _parse_pairs(spec):
            """Parse a '(h,w)_(h,w)_...' string into a list of [h, w] pairs."""
            if len(spec) == 0:
                return []
            return [[int(p.split(',')[0].replace('(', '')),
                     int(p.split(',')[1].replace(')', ''))]
                    for p in spec.split('_')]

        # CNN front-end configuration (ignored for pure-RNN encoder types)
        if conv_channels and rnn_type not in ['blstm', 'lstm', 'bgru', 'gru']:
            channels = [int(c) for c in conv_channels.split('_')
                        ] if len(conv_channels) > 0 else []
            kernel_sizes = _parse_pairs(conv_kernel_sizes)
            if rnn_type in ['tds', 'gated_conv']:
                # TDS/gated-conv encoders define their own striding/pooling.
                strides = []
                poolings = []
            else:
                strides = _parse_pairs(conv_strides)
                poolings = _parse_pairs(conv_poolings)
            if 'conv_' in rnn_type:
                # The CNN front-end already reduces the time resolution,
                # so per-layer RNN subsampling is disabled.
                subsample = [1] * self.n_layers
                logger.warning(
                    'Subsampling is automatically ignored because CNN layers are used before RNN layers.'
                )
        else:
            channels = []
            kernel_sizes = []
            strides = []
            poolings = []

        if len(channels) > 0:
            if rnn_type == 'tds':
                self.conv = TDSEncoder(input_dim=input_dim * n_stacks,
                                       in_channel=conv_in_channel,
                                       channels=channels,
                                       kernel_sizes=kernel_sizes,
                                       dropout=dropout,
                                       bottleneck_dim=last_proj_dim)
            elif rnn_type == 'gated_conv':
                self.conv = GatedConvEncoder(input_dim=input_dim * n_stacks,
                                             in_channel=conv_in_channel,
                                             channels=channels,
                                             kernel_sizes=kernel_sizes,
                                             dropout=dropout,
                                             bottleneck_dim=last_proj_dim,
                                             param_init=param_init)
            else:
                # Frame stacking/splicing is incompatible with the generic
                # CNN front-end.
                assert n_stacks == 1 and n_splices == 1
                self.conv = ConvEncoder(input_dim,
                                        in_channel=conv_in_channel,
                                        channels=channels,
                                        kernel_sizes=kernel_sizes,
                                        strides=strides,
                                        poolings=poolings,
                                        dropout=0,
                                        batch_norm=conv_batch_norm,
                                        bottleneck_dim=conv_bottleneck_dim,
                                        param_init=param_init)
            self._output_dim = self.conv.output_dim
        else:
            self._output_dim = input_dim * n_splices * n_stacks
            self.conv = None

        self.padding = Padding()

        if rnn_type not in ['conv', 'tds', 'gated_conv']:
            self.rnn = nn.ModuleList()
            self.dropout = nn.ModuleList()
            self.proj = nn.ModuleList() if n_projs > 0 else None

            # Per-layer temporal subsampling modules
            self.subsample = None
            if np.prod(subsample) > 1:
                if subsample_type == 'max_pool':
                    self.subsample = nn.ModuleList(
                        [MaxpoolSubsampler(subsample[l])
                         for l in range(n_layers)])
                elif subsample_type == 'concat':
                    self.subsample = nn.ModuleList([
                        ConcatSubsampler(subsample[l], n_units, self.n_dirs)
                        for l in range(n_layers)
                    ])
                elif subsample_type == 'drop':
                    self.subsample = nn.ModuleList(
                        [DropSubsampler(subsample[l])
                         for l in range(n_layers)])

            # Network-in-network blocks (1x1 conv between RNN layers)
            self.nin = nn.ModuleList() if nin else None

            # The RNN class is the same for every layer; select it once and
            # fail fast on an unsupported rnn_type.
            if 'lstm' in rnn_type:
                rnn_cls = nn.LSTM
            elif 'gru' in rnn_type:
                rnn_cls = nn.GRU
            else:
                raise ValueError(
                    'rnn_type must be "(conv_)(b)lstm" or "(conv_)(b)gru".'
                )

            for l in range(n_layers):
                self.rnn += [
                    rnn_cls(self._output_dim,
                            n_units,
                            1,
                            bias=True,
                            batch_first=True,
                            dropout=0,
                            bidirectional=self.bidirectional)
                ]
                self.dropout += [nn.Dropout(p=dropout)]
                self._output_dim = n_units * self.n_dirs

                # Projection layer (every layer except the last)
                if self.proj is not None and l != n_layers - 1:
                    self.proj += [Linear(n_units * self.n_dirs, n_projs)]
                    self._output_dim = n_projs

                # Task-specific layers for the auxiliary tasks
                if l == n_layers_sub1 - 1 and task_specific_layer:
                    self.rnn_sub1 = rnn_cls(self._output_dim,
                                            n_units,
                                            1,
                                            bias=True,
                                            batch_first=True,
                                            dropout=0,
                                            bidirectional=self.bidirectional)
                    self.dropout_sub1 = nn.Dropout(p=dropout)
                    # Guard last_proj_dim > 0: the unguarded check built a
                    # zero-output Linear whenever the default 0 was used.
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        # NOTE(review): input dim is n_units, not
                        # n_units * n_dirs; presumably the forward pass merges
                        # directions before this bridge -- confirm against the
                        # forward implementation.
                        self.bridge_sub1 = Linear(n_units, last_proj_dim)
                if l == n_layers_sub2 - 1 and task_specific_layer:
                    self.rnn_sub2 = rnn_cls(self._output_dim,
                                            n_units,
                                            1,
                                            bias=True,
                                            batch_first=True,
                                            dropout=0,
                                            bidirectional=self.bidirectional)
                    self.dropout_sub2 = nn.Dropout(p=dropout)
                    if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                        self.bridge_sub2 = Linear(n_units, last_proj_dim)

                # Network-in-network (excluded after the last layer)
                if self.nin is not None and l != n_layers - 1:
                    self.nin += [NiN(self._output_dim)]

            # Bridge layer projecting the final encoder output.
            # Guard last_proj_dim > 0: without it, the default last_proj_dim=0
            # created a Linear(dim, 0) and clobbered _output_dim to 0.
            if last_proj_dim > 0 and last_proj_dim != self.output_dim:
                self.bridge = Linear(self._output_dim, last_proj_dim)
                self._output_dim = last_proj_dim

        # Initialize parameters
        self.reset_parameters(param_init)