def __init__(self,
                 in_size,
                 out_size,
                 projection_size=[512],
                 projection_fn='LeakyReLU',
                 projection_do=0.0,
                 cbhg_cfg={}):
        super().__init__()
        self.in_size = in_size
        self.out_size = out_size
        self.projection_size = projection_size
        self.projection_fn = projection_fn
        self.projection_do = ConfigParser.list_parser(projection_do,
                                                      n=len(projection_size))

        self.inverter_lyr = CBHG1d(in_size,
                                   conv_proj_filter=[256, in_size],
                                   **cbhg_cfg)
        _tmp = []
        prev_size = self.inverter_lyr.out_features
        for ii in range(len(projection_size)):
            _tmp.append(nn.Linear(prev_size, self.projection_size[ii]))
            _tmp.append(generator_act_module(self.projection_fn))
            _tmp.append(nn.Dropout(p=self.projection_do[ii]))
            prev_size = self.projection_size[ii]
        _tmp.append(nn.Linear(prev_size, out_size))
        self.projection_lyr = nn.Sequential(*_tmp)
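
Every constructor in these examples broadcasts scalar hyperparameters through ConfigParser.list_parser. Its implementation is not shown here, so the following is only a minimal sketch of the assumed behavior (a scalar or single dict repeated n times, a list passed through), included to make the calls above easier to read.

# Minimal sketch of the assumed ConfigParser.list_parser behavior; the real
# helper in this repository may differ.
def list_parser(value, n=1):
    if isinstance(value, (list, tuple)):
        return list(value)        # e.g. [0.5, 0.5] stays as-is
    return [value] * n            # e.g. 0.0 -> [0.0, 0.0, ...] of length n

# list_parser(0.25, n=3) == [0.25, 0.25, 0.25]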
Example #2
    def __init__(self,
                 speaker_emb_dim=256,
                 speaker_proj_size=[512],
                 speaker_proj_fn='none',
                 speaker_proj_do=0.0,
                 speaker_integrate_fn='none',
                 speaker_emb_scale=1.0,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.speaker_emb_dim = speaker_emb_dim
        self.speaker_proj_size = speaker_proj_size
        self.speaker_proj_fn = speaker_proj_fn
        self.speaker_proj_do = ConfigParser.list_parser(
            speaker_proj_do, n=len(speaker_proj_size))
        self.speaker_emb_scale = speaker_emb_scale  # scalar x spk_vector
        # speaker_integrate_fn applied before non-linearity on decoder layer
        self.speaker_integrate_fn = speaker_integrate_fn

        _tmp = []
        prev_size = speaker_emb_dim
        for ii in range(len(self.speaker_proj_size)):
            _tmp.append(nn.Linear(prev_size, self.speaker_proj_size[ii]))
            _tmp.append(generator_act_module(self.speaker_proj_fn))
            _tmp.append(nn.Dropout(self.speaker_proj_do[ii]))
            prev_size = self.speaker_proj_size[ii]

        self.speaker_proj_lyr = nn.Sequential(*_tmp)
        self.speaker_module_lyr = nn.Module()

        # speaker proj -> decoder prenet (last layer) #
        self.speaker_module_lyr.add_module(
            'dec_proj_prenet_lyr',
            nn.Linear(prev_size, self.dec_prenet_lyr[-1].out_features))

        # speaker proj -> decoder regression core (first layer) #
        assert len(self.dec_core_gen_lyr) >= 1, \
            "dec_core_gen_lyr must have at least 1 layer"
        self.speaker_module_lyr.add_module(
            'dec_proj_core_gen_lyr',
            nn.Linear(prev_size, self.dec_core_gen_lyr[0].out_features))
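
The comment above states that the speaker projection is applied before the decoder non-linearity and scaled by speaker_emb_scale; the forward pass itself is not part of this snippet. The following is only a hypothetical illustration of that additive integration, with made-up shapes and names.

import torch
import torch.nn as nn

# Hypothetical illustration (not this repository's forward code) of adding the
# projected speaker embedding to a decoder layer's pre-activation.
speaker_emb_dim, dec_hidden = 256, 128
dec_proj_prenet_lyr = nn.Linear(speaker_emb_dim, dec_hidden)

spk_emb = torch.randn(4, speaker_emb_dim)       # (batch, speaker_emb_dim)
dec_pre_act = torch.randn(4, dec_hidden)        # decoder prenet pre-activation
dec_pre_act = dec_pre_act + 1.0 * dec_proj_prenet_lyr(spk_emb)  # speaker_emb_scale = 1.0
out = torch.relu(dec_pre_act)                   # non-linearity applied afterwards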
Example #3
    def __init__(self,
                 dec_bern_end_size=[256],
                 dec_bern_end_fn='Tanh',
                 dec_bern_end_do=0.0,
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.dec_bern_end_size = dec_bern_end_size
        self.dec_bern_end_fn = dec_bern_end_fn
        self.dec_bern_end_do = ConfigParser.list_parser(
            dec_bern_end_do, n=len(dec_bern_end_size))

        # p(t = frame stop | dec_hid[t], y[t]) #
        _tmp = []
        prev_size = self.dec_att_lyr.output_size + self.dec_out_size
        for ii in range(len(dec_bern_end_size)):
            _tmp.append(nn.Linear(prev_size, self.dec_bern_end_size[ii]))
            _tmp.append(generator_act_module(self.dec_bern_end_fn))
            _tmp.append(nn.Dropout(p=self.dec_bern_end_do[ii]))
            prev_size = self.dec_bern_end_size[ii]

        _tmp.append(nn.Linear(prev_size, 1))
        # output is logit, not transformed into sigmoid #
        self.dec_bernoulli_end_lyr = nn.Sequential(*_tmp)
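
Because the final layer emits a raw logit (no sigmoid), training would typically pair it with a logit-based binary cross-entropy. The loss code is not part of this snippet; the following is just an assumed usage sketch.

import torch
import torch.nn as nn

# Assumed training usage for the stop-frame logit (not code from this repo):
# BCEWithLogitsLoss applies the sigmoid internally, which is why the layer
# above leaves its output as a logit.
logit = torch.randn(8, 1)                     # stand-in for dec_bernoulli_end_lyr output
target = torch.randint(0, 2, (8, 1)).float()  # 1 = last frame, 0 = keep decoding
loss = nn.BCEWithLogitsLoss()(logit, target)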
Example #4
    def __init__(
            self,
            enc_in_size,
            dec_in_size,
            dec_out_size,
            enc_emb_size=256,
            enc_emb_do=0.0,
            enc_prenet_size=[256, 128],
            enc_prenet_do=[0.5, 0.5],
            enc_prenet_fn='leaky_relu',
            dec_prenet_size=[256, 128],
            dec_prenet_do=[0.5, 0.5],
            dec_prenet_fn='leaky_relu',
            dec_rnn_sizes=[256, 256],
            dec_rnn_cfgs={"type": "lstm"},
            dec_rnn_do=0.0,
            dec_cfg={"type": "standard_decoder"},
            att_cfg={"type": "mlp"},
            dec_core_gen_size=[512],
            dec_core_gen_fn='leaky_relu',
            dec_core_gen_do=0.0,
            # CBHG #
            enc_cbhg_cfg={},
            # FRAME ENDING #
            dec_bern_end_size=[256],
            dec_bern_end_fn='LeakyReLU',
            dec_bern_end_do=0.0,
            # OPTIONAL #
            dec_in_range=None):
        """
        Args:
            enc_in_size : size of vocab
            dec_in_size : input (mel) dim size
            dec_out_size : output (mel) dim size (usually same as dec_in_size)
            dec_in_range : 
                pair of integer [x, y] \in [0, dec_in_size], 
                all dims outside this pair will be masked as 0
                in Tacotron paper, they only use last time-step instead of all group
        """
        super().__init__()
        self.enc_in_size = enc_in_size
        self.dec_in_size = dec_in_size
        self.dec_out_size = dec_out_size  # mel spec dim size
        self.enc_emb_size = enc_emb_size
        self.enc_emb_do = enc_emb_do
        self.enc_prenet_size = enc_prenet_size
        self.enc_prenet_do = ConfigParser.list_parser(enc_prenet_do,
                                                      len(enc_prenet_size))
        self.enc_prenet_fn = enc_prenet_fn
        self.dec_prenet_size = dec_prenet_size
        self.dec_prenet_do = ConfigParser.list_parser(dec_prenet_do,
                                                      len(dec_prenet_size))
        self.dec_prenet_fn = dec_prenet_fn
        self.dec_rnn_sizes = dec_rnn_sizes
        self.dec_rnn_cfgs = dec_rnn_cfgs
        self.dec_rnn_do = dec_rnn_do
        self.dec_core_gen_size = dec_core_gen_size
        self.dec_core_gen_fn = dec_core_gen_fn
        self.dec_core_gen_do = ConfigParser.list_parser(
            dec_core_gen_do, len(dec_core_gen_size))
        self.dec_cfg = dec_cfg
        self.att_cfg = att_cfg

        # FRAME ENDING #
        self.dec_bern_end_size = dec_bern_end_size
        self.dec_bern_end_fn = dec_bern_end_fn
        self.dec_bern_end_do = ConfigParser.list_parser(dec_bern_end_do,
                                                        len(dec_bern_end_size))

        # OPTIONAL #
        self.dec_in_range = dec_in_range
        if self.dec_in_range is not None:
            assert isinstance(self.dec_in_range, (list, tuple)) \
                    and len(self.dec_in_range) == 2

        # CBHG config #
        self.enc_cbhg_cfg = ConfigParser.item_parser(enc_cbhg_cfg)

        self.enc_emb_lyr = nn.Embedding(enc_in_size, enc_emb_size)
        # init enc prenet #
        self.enc_prenet_lyr = nn.ModuleList()
        prev_size = enc_emb_size
        for ii in range(len(self.enc_prenet_size)):
            self.enc_prenet_lyr.append(
                nn.Linear(prev_size, enc_prenet_size[ii]))
            prev_size = enc_prenet_size[ii]
        # init enc middle #
        self.enc_core_lyr = cbhg.CBHG1d(prev_size, **enc_cbhg_cfg)
        # init dec prenet #
        self.dec_prenet_lyr = nn.ModuleList()
        prev_size = dec_in_size if self.dec_in_range is None else (
            (self.dec_in_range[-1] or 0) - (self.dec_in_range[-2] or 0))
        for ii in range(len(self.dec_prenet_size)):
            self.dec_prenet_lyr.append(
                nn.Linear(prev_size, dec_prenet_size[ii]))
            prev_size = dec_prenet_size[ii]

        # init dec rnn #
        _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs,
                                                 len(dec_rnn_sizes))
        for ii in range(len(dec_rnn_sizes)):
            _type = _dec_rnn_cfgs[ii]['type']
            if re.match('stateful.*cell', _type) is None:
                _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
        # TODO : dec_cfg #
        final_enc_size = self.enc_core_lyr.output_size
        self.dec_att_lyr = decoder.StandardDecoder(att_cfg, final_enc_size,
                                                   prev_size, dec_rnn_sizes,
                                                   _dec_rnn_cfgs, dec_rnn_do)

        # init decoder layer melspec generator #
        prev_size = self.dec_att_lyr.output_size
        self.dec_core_gen_lyr = nn.ModuleList()
        for ii in range(len(self.dec_core_gen_size)):
            self.dec_core_gen_lyr.append(
                nn.Linear(prev_size, self.dec_core_gen_size[ii]))
            prev_size = self.dec_core_gen_size[ii]
        self.dec_core_gen_lyr.append(nn.Linear(prev_size, self.dec_out_size))

        # init decoder frame ending predictor #
        # p(t=STOP | dec_hid[t], y[t]) #
        _tmp = []
        prev_size = self.dec_att_lyr.output_size + self.dec_out_size
        for ii in range(len(dec_bern_end_size)):
            _tmp.append(nn.Linear(prev_size, self.dec_bern_end_size[ii]))
            _tmp.append(generator_act_module(self.dec_bern_end_fn))
            _tmp.append(nn.Dropout(p=self.dec_bern_end_do[ii]))
            prev_size = self.dec_bern_end_size[ii]
        _tmp.append(nn.Linear(prev_size, 1))
        # output is logit, not transformed into sigmoid #
        self.dec_bernoulli_end_lyr = nn.Sequential(*_tmp)
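
The dec_in_range option described in the docstring keeps only a slice of the decoder input dims; the masking itself happens in the forward pass, which is not shown here. A hedged sketch of the assumed slicing, consistent with the prev_size computation above:

import torch

# Hypothetical sketch of dec_in_range slicing (an assumption; the real masking
# lives in the model's forward pass).
dec_in_size = 80
dec_in_range = [75, 80]            # keep only dims 75..79 of each input frame
y_prev = torch.randn(4, dec_in_size)
start, end = dec_in_range
prenet_in = y_prev[:, start:end]   # width end - start, matching prev_size above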
Example #5
    def __init__(
            self,
            enc_in_size,
            dec_in_size,
            dec_out_size,
            enc_emb_size=256,
            enc_emb_do=0.0,
            enc_conv_sizes=[5, 5, 5],
            enc_conv_filter=[256, 256, 256],
            enc_conv_do=0.25,
            enc_conv_fn='LeakyReLU',
            enc_rnn_sizes=[256],
            enc_rnn_cfgs={
                "type": "lstm",
                'bi': True
            },
            enc_rnn_do=0.2,
            dec_prenet_size=[256, 256],
            dec_prenet_fn='leaky_relu',
            dec_prenet_do=0.25,
            dec_rnn_sizes=[512, 512],
            dec_rnn_cfgs={"type": "lstm"},
            dec_rnn_do=0.2,
            dec_proj_size=[512, 512],
            dec_proj_fn='leaky_relu',
            dec_proj_do=0.25,
            dec_bern_end_size=[256],
            dec_bern_end_do=0.0,
            dec_bern_end_fn='LeakyReLU',
            dec_cfg={"type": "standard_decoder"},
            att_cfg={
                "type": "mlp_history",
                "kwargs": {
                    "history_conv_ksize": [2, 4, 8]
                }
            },  # location-sensitive attention
            # OPTIONAL #
            dec_in_range=None,
            use_bn=False,  # Tacotron V2 enables BatchNorm by default
            use_ln=False,  # use layer normalization on feedforward layers
    ):
        """
        Tacotron V2 
        
        Decoder generates 2 outputs mel + linear spec, use main for conditional input next step
        Args:
            enc_in_size : size of vocab
            dec_in_size : input (mel) dim size
            dec_out_size : output (mel/linear) dim size (usually same as dec_in_size)
            dec_in_range : 
                pair of integer [x, y] \in [0, dec_in_size], 
                all dims outside this pair will be masked as 0
                in Tacotron paper, they only use last time-step instead of all group
        """
        super().__init__()
        self.enc_in_size = enc_in_size
        self.dec_in_size = dec_in_size
        self.dec_out_size = dec_out_size  # output projection -> mel/linear spec #

        self.enc_emb_size = enc_emb_size
        self.enc_emb_do = enc_emb_do

        self.enc_conv_sizes = enc_conv_sizes
        self.enc_conv_filter = enc_conv_filter
        self.enc_conv_do = ConfigParser.list_parser(enc_conv_do,
                                                    len(enc_conv_sizes))
        self.enc_conv_fn = enc_conv_fn

        self.enc_rnn_sizes = enc_rnn_sizes
        self.enc_rnn_do = ConfigParser.list_parser(enc_rnn_do,
                                                   len(enc_rnn_sizes))
        self.enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs,
                                                     len(enc_rnn_sizes))

        self.dec_prenet_size = dec_prenet_size
        self.dec_prenet_do = ConfigParser.list_parser(dec_prenet_do,
                                                      len(dec_prenet_size))
        self.dec_prenet_fn = dec_prenet_fn
        self.dec_rnn_sizes = dec_rnn_sizes
        self.dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs,
                                                     len(dec_rnn_sizes))
        self.dec_rnn_do = ConfigParser.list_parser(dec_rnn_do,
                                                   len(dec_rnn_sizes))

        self.dec_proj_size = dec_proj_size
        self.dec_proj_fn = dec_proj_fn
        self.dec_proj_do = ConfigParser.list_parser(dec_proj_do,
                                                    len(dec_proj_size))

        self.dec_bern_end_size = dec_bern_end_size
        self.dec_bern_end_do = ConfigParser.list_parser(
            dec_bern_end_do, len(dec_bern_end_size))
        self.dec_bern_end_fn = dec_bern_end_fn

        self.dec_cfg = dec_cfg
        self.att_cfg = att_cfg
        self.use_bn = use_bn
        self.use_ln = use_ln
        if use_ln:
            raise ValueError("Layer Normalization is not supported yet!")

        # OPTIONAL #
        self.dec_in_range = dec_in_range
        if self.dec_in_range is not None:
            assert isinstance(self.dec_in_range, (list, tuple)) \
                    and len(self.dec_in_range) == 2
        ### FINISH ###

        # init emb layer
        self.enc_emb_lyr = nn.Embedding(enc_in_size, enc_emb_size)

        # init enc conv #
        _tmp = []
        prev_size = enc_emb_size
        for ii in range(len(self.enc_conv_sizes)):
            _tmp.append(
                Conv1dEv(prev_size,
                         self.enc_conv_filter[ii],
                         self.enc_conv_sizes[ii],
                         padding='same'))
            _tmp.append(generator_act_module(self.enc_conv_fn))
            if self.use_bn:
                _tmp.append(nn.BatchNorm1d(self.enc_conv_filter[ii]))
            _tmp.append(nn.Dropout(p=self.enc_conv_do[ii]))
            prev_size = self.enc_conv_filter[ii]
        self.enc_conv_lyr = nn.Sequential(*_tmp)

        # init enc rnn #
        self.enc_rnn_lyr = nn.ModuleList()
        _enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs,
                                                 len(enc_rnn_sizes))
        for ii in range(len(self.enc_rnn_sizes)):
            _rnn_cfg = {}
            _rnn_cfg['type'] = _enc_rnn_cfgs[ii]['type']
            _rnn_cfg['args'] = [
                prev_size, enc_rnn_sizes[ii], 1, True, True, 0,
                _enc_rnn_cfgs[ii]['bi']
            ]
            self.enc_rnn_lyr.append(generator_rnn(_rnn_cfg))
            prev_size = enc_rnn_sizes[ii]

        # init dec prenet #
        _tmp = []
        prev_size = dec_in_size if self.dec_in_range is None else (
            (self.dec_in_range[-1] or 0) - (self.dec_in_range[-2] or 0))
        for ii in range(len(self.dec_prenet_size)):
            _tmp.append(nn.Linear(prev_size, self.dec_prenet_size[ii]))
            prev_size = self.dec_prenet_size[ii]

        self.dec_prenet_lyr = nn.ModuleList(_tmp)

        # init dec rnn #
        _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs,
                                                 len(dec_rnn_sizes))
        for ii in range(len(dec_rnn_sizes)):
            _type = _dec_rnn_cfgs[ii]['type']
            if re.match('stateful.*cell', _type) is None:
                _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)

        final_enc_size = self.enc_rnn_lyr[-1].hidden_size * (
            2 if self.enc_rnn_lyr[-1].bidirectional else 1)
        assert 'type' in dec_cfg, "decoder type need to be defined"
        if dec_cfg['type'] == 'standard_decoder':
            _tmp_dec_cfg = dict(dec_cfg)
            del _tmp_dec_cfg['type']
            self.dec_att_lyr = decoder.StandardDecoder(att_cfg=att_cfg,
                                                       ctx_size=final_enc_size,
                                                       in_size=prev_size,
                                                       rnn_sizes=dec_rnn_sizes,
                                                       rnn_cfgs=_dec_rnn_cfgs,
                                                       rnn_do=dec_rnn_do,
                                                       **_tmp_dec_cfg)
        else:
            raise NotImplementedError("decoder type {} is not found".format(
                dec_cfg['type']))

        # init dec lin proj -> mel/linear-spec
        prev_size = self.dec_att_lyr.out_features
        _tmp = []
        for ii in range(len(self.dec_proj_size)):
            _tmp.append(nn.Linear(prev_size, self.dec_proj_size[ii]))
            prev_size = self.dec_proj_size[ii]
        _tmp.append(nn.Linear(prev_size, self.dec_out_size))
        self.dec_proj_lyr = nn.ModuleList(_tmp)

        # init dec bern end layer
        _tmp = []
        prev_size = self.dec_out_size + self.dec_att_lyr.out_features + (
            self.enc_rnn_lyr[-1].hidden_size *
            (2 if self.enc_rnn_lyr[-1].bidirectional else 1))
        for ii in range(len(self.dec_bern_end_size)):
            _tmp.append(nn.Linear(prev_size, self.dec_bern_end_size[ii]))
            _tmp.append(generator_act_module(dec_bern_end_fn))
            _tmp.append(nn.Dropout(self.dec_bern_end_do[ii]))
            prev_size = self.dec_bern_end_size[ii]
        _tmp.append(nn.Linear(prev_size, 1))
        self.dec_bern_end_lyr = nn.Sequential(*_tmp)
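
The encoder RNNs above are built through generator_rnn with a positional 'args' list. That factory is not included here; judging from the call, the positions appear to line up with torch.nn.LSTM/GRU(input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional), so the sketch below is an assumption, not the library's actual helper.

import torch.nn as nn

# Assumed minimal equivalent of generator_rnn (a sketch, not the repository's
# factory): pick the RNN class by name and forward the positional args.
def generator_rnn_sketch(cfg):
    rnn_cls = {'lstm': nn.LSTM, 'gru': nn.GRU}[cfg['type'].lower()]
    return rnn_cls(*cfg['args'])

# mirrors the _rnn_cfg built in the loop above:
rnn = generator_rnn_sketch({'type': 'lstm',
                            'args': [256, 256, 1, True, True, 0, True]})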
Example #6
    def __init__(
        self,
        enc_in_size,
        dec_in_size,
        dec_out_size,
        enc_fnn_sizes=[512],
        enc_fnn_act='LeakyReLU',
        enc_fnn_do=0.25,
        enc_rnn_sizes=[256, 256, 256],
        enc_rnn_cfgs={
            "type": "lstm",
            "bi": True
        },
        enc_rnn_do=0.25,
        downsampling=[False, True, True],
        dec_emb_size=256,
        dec_emb_do=0.25,
        dec_emb_tied_weight=True,
        # tie the char/word embedding weights with the softmax layer
        dec_rnn_sizes=[512, 512],
        dec_rnn_cfgs={"type": "lstm"},
        dec_rnn_do=0.25,
        dec_cfg={"type": "standard_decoder"},
        att_cfg={"type": "mlp"},
        use_layernorm=False,
    ):
        super().__init__()

        self.enc_in_size = enc_in_size
        self.dec_in_size = dec_in_size
        self.dec_out_size = dec_out_size
        self.enc_fnn_sizes = enc_fnn_sizes
        self.enc_fnn_act = enc_fnn_act
        self.enc_fnn_do = ConfigParser.list_parser(enc_fnn_do,
                                                   len(enc_fnn_sizes))
        self.enc_rnn_sizes = enc_rnn_sizes
        self.enc_rnn_cfgs = enc_rnn_cfgs
        self.enc_rnn_do = ConfigParser.list_parser(enc_rnn_do,
                                                   len(enc_rnn_sizes))
        self.downsampling = ConfigParser.list_parser(downsampling,
                                                     len(enc_rnn_sizes))

        self.dec_emb_size = dec_emb_size
        self.dec_emb_do = dec_emb_do
        self.dec_emb_tied_weight = dec_emb_tied_weight
        self.dec_rnn_sizes = dec_rnn_sizes
        self.dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs,
                                                     len(dec_rnn_sizes))
        self.dec_rnn_do = ConfigParser.list_parser(dec_rnn_do,
                                                   len(dec_rnn_sizes))
        self.dec_cfg = dec_cfg
        self.att_cfg = att_cfg

        self.use_layernorm = use_layernorm
        if self.use_layernorm:
            raise ValueError("LayerNorm is not implemented yet")

        # modules #
        # init encoder #
        prev_size = enc_in_size
        _tmp = []
        for ii in range(len(enc_fnn_sizes)):
            _tmp.append(nn.Linear(prev_size, enc_fnn_sizes[ii]))
            if use_layernorm:
                _tmp.append(LayerNorm(enc_fnn_sizes[ii]))
            _tmp.append(generator_act_module(enc_fnn_act))
            _tmp.append(nn.Dropout(p=self.enc_fnn_do[ii]))
            prev_size = enc_fnn_sizes[ii]
        self.enc_fnn_lyr = nn.Sequential(*_tmp)

        self.enc_rnn_lyr = nn.ModuleList()
        _enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs,
                                                 len(enc_rnn_sizes))
        for ii in range(len(enc_rnn_sizes)):
            _rnn_cfg = {}
            _rnn_cfg['type'] = _enc_rnn_cfgs[ii]['type']
            _rnn_cfg['args'] = [
                prev_size, enc_rnn_sizes[ii], 1, True, True, 0,
                _enc_rnn_cfgs[ii]['bi']
            ]
            self.enc_rnn_lyr.append(generator_rnn(_rnn_cfg))
            prev_size = enc_rnn_sizes[ii] * (2
                                             if _enc_rnn_cfgs[ii]['bi'] else 1)
        final_enc_size = prev_size
        # init decoder #
        self.dec_emb_lyr = nn.Embedding(self.dec_in_size,
                                        dec_emb_size,
                                        padding_idx=None)
        prev_size = dec_emb_size
        _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs,
                                                 len(dec_rnn_sizes))
        for ii in range(len(dec_rnn_sizes)):
            _type = _dec_rnn_cfgs[ii]['type']
            if re.match('stateful.*cell', _type) is None:
                _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
        # TODO : dec_cfg #
        assert 'type' in dec_cfg, "decoder type need to be defined"
        if dec_cfg['type'] == 'standard_decoder':
            _tmp_dec_cfg = dict(dec_cfg)
            del _tmp_dec_cfg['type']  #
            self.dec_att_lyr = decoder.StandardDecoder(att_cfg=att_cfg,
                                                       ctx_size=final_enc_size,
                                                       in_size=dec_emb_size,
                                                       rnn_sizes=dec_rnn_sizes,
                                                       rnn_cfgs=_dec_rnn_cfgs,
                                                       rnn_do=dec_rnn_do,
                                                       **_tmp_dec_cfg)
        else:
            raise NotImplementedError("decoder type {} is not found".format(
                dec_cfg['type']))
        self.dec_presoftmax_lyr = nn.Linear(self.dec_att_lyr.output_size,
                                            dec_out_size)
        if dec_emb_tied_weight:
            assert dec_out_size == dec_in_size \
                and self.dec_emb_lyr.embedding_dim == self.dec_presoftmax_lyr.in_features
            self.dec_presoftmax_lyr.weight = self.dec_emb_lyr.weight
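
All of these constructors resolve activation names ('LeakyReLU', 'leaky_relu', 'Tanh', 'none') through generator_act_module. Its implementation is not included in this listing, so the following is only an assumed minimal equivalent for reading the snippets.

import torch.nn as nn

# Assumed minimal equivalent of generator_act_module (a sketch, not the
# repository's helper): map an activation name to an nn.Module instance,
# with 'none' resolving to identity.
def generator_act_module_sketch(name):
    table = {
        'leakyrelu': nn.LeakyReLU,
        'leaky_relu': nn.LeakyReLU,
        'relu': nn.ReLU,
        'tanh': nn.Tanh,
        'sigmoid': nn.Sigmoid,
        'none': nn.Identity,
    }
    return table[name.lower()]()

act = generator_act_module_sketch('LeakyReLU')   # -> nn.LeakyReLU()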