def __init__(self, in_size, out_size, conv_bank_k=8, conv_fn_act='leaky_relu',
             conv_bank_filter=[128, 128], conv_do=0.25):
    super().__init__()
    self.in_size = in_size
    self.out_size = out_size
    self.conv_bank_filter = conv_bank_filter
    self.conv_bank_k = ConfigParser.list_parser(conv_bank_k, len(self.conv_bank_filter))
    self.conv_fn_act = conv_fn_act
    self.conv_do = ConfigParser.list_parser(conv_do, len(self.conv_bank_filter))

    self.conv_bank_lyrs = nn.ModuleList()
    prev_size = in_size
    for ii in range(len(conv_bank_filter)):
        self.conv_bank_lyrs.append(
            MultiscaleConv1d(prev_size, out_channels=conv_bank_filter[ii],
                             kernel_sizes=list(range(1, self.conv_bank_k[ii] + 1)),
                             padding='same'))
        prev_size = self.conv_bank_lyrs[-1].out_channels
    self.lin_pred_lyr = nn.Linear(prev_size, out_size)
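
# A minimal sketch of what a multiscale conv bank like MultiscaleConv1d presumably does:
# run several Conv1d layers with different kernel sizes over the same input and
# concatenate their outputs along the channel axis. Class name and details below are
# illustrative assumptions, not the actual MultiscaleConv1d implementation.
import torch
import torch.nn as nn

class MultiscaleConv1dSketch(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_sizes):
        super().__init__()
        # one conv per kernel size, all padded so the time axis length is preserved
        self.convs = nn.ModuleList(
            nn.Conv1d(in_channels, out_channels, k, padding='same') for k in kernel_sizes)
        self.out_channels = out_channels * len(kernel_sizes)

    def forward(self, x):  # x: (batch, in_channels, time)
        return torch.cat([conv(x) for conv in self.convs], dim=1)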
def __init__(self, in_size, out_size, projection_size=[512], projection_fn='LeakyReLU',
             projection_do=0.0, cbhg_cfg={}):
    super().__init__()
    self.in_size = in_size
    self.out_size = out_size
    self.projection_size = projection_size
    self.projection_fn = projection_fn
    self.projection_do = ConfigParser.list_parser(projection_do, n=len(projection_size))

    self.inverter_lyr = CBHG1d(in_size, conv_proj_filter=[256, in_size], **cbhg_cfg)

    _tmp = []
    prev_size = self.inverter_lyr.out_features
    for ii in range(len(projection_size)):
        _tmp.append(nn.Linear(prev_size, self.projection_size[ii]))
        _tmp.append(generator_act_module(self.projection_fn))
        _tmp.append(nn.Dropout(p=self.projection_do[ii]))
        prev_size = self.projection_size[ii]
    _tmp.append(nn.Linear(prev_size, out_size))
    self.projection_lyr = nn.Sequential(*_tmp)
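
# ConfigParser.list_parser is used throughout this file to broadcast a scalar or single
# dict hyperparameter into a per-layer list. A hedged sketch of the assumed behavior
# (the real helper likely also copies dict entries so per-layer edits do not alias):
import copy

def list_parser_sketch(value, n=1):
    if isinstance(value, list):
        # already one entry per layer
        return value
    # scalar / dict: repeat the same setting for every layer, copying to avoid aliasing
    return [copy.deepcopy(value) for _ in range(n)]

# e.g. list_parser_sketch(0.25, n=3) -> [0.25, 0.25, 0.25]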
def __init__(self, enc_in_size, dec_in_size, dec_out_size, dec_rnn_sizes=[512, 512],
             dec_rnn_cfgs={'type': 'lstm'}, dec_rnn_do=0.25,
             dec_cfg={'type': 'standard_decoder'}, att_cfg={'type': 'mlp'}):
    super().__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.dec_out_size = dec_out_size
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = dec_rnn_cfgs
    self.dec_rnn_do = ConfigParser.list_parser(dec_rnn_do, len(dec_rnn_sizes))
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg

    # init encoder #
    self.enc_lyr = encoder.StandardRNNEncoder(enc_in_size, do=0.0,
                                              downsampling={'type': 'last', 'step': 2})

    # init decoder #
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    prev_size = dec_in_size
    self.dec_lyr = decoder.StandardDecoder(att_cfg, self.enc_lyr.out_features, prev_size,
                                           dec_rnn_sizes, _dec_rnn_cfgs, self.dec_rnn_do)

    # init decoder regression #
    self.dec_core_reg_lyr = nn.Linear(self.dec_lyr.out_features, dec_out_size)
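
# Decoder RNN configs name plain layer types ('lstm', 'gru'); the constructors in this
# file rewrite them to their stateful-cell variants before building the decoder. A small
# standalone sketch of the renaming rule used above (regex and naming convention taken
# directly from this code):
import re

def to_stateful_cell(type_name):
    # 'lstm' -> 'stateful_lstmcell'; names already matching 'stateful...cell' pass through
    if re.match('stateful.*cell', type_name) is None:
        return 'stateful_{}cell'.format(type_name)
    return type_name

# e.g. to_stateful_cell('lstm') == 'stateful_lstmcell'
#      to_stateful_cell('stateful_lstmcell') is returned unchanged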
def __init__(self, enc_in_size, dec_in_size, n_class,
             enc_fnn_sizes=[512], enc_fnn_act='tanh', enc_fnn_do=0.25,
             enc_rnn_sizes=[256, 256, 256], enc_rnn_cfgs={"type": "lstm", "bi": True},
             enc_rnn_do=0.25, downsampling=None,
             dec_emb_size=64, dec_emb_do=0.0,
             dec_rnn_sizes=[512, 512], dec_rnn_cfgs={"type": "lstm"}, dec_rnn_do=0.25,
             dec_cfg={"type": "standard_decoder"}, att_cfg={"type": "mlp"}):
    super(ENCRNN_DECRNN_ATT_ASR, self).__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.n_class = n_class
    self.enc_fnn_sizes = enc_fnn_sizes
    self.enc_fnn_act = enc_fnn_act
    self.enc_fnn_do = ConfigParser.list_parser(enc_fnn_do, len(enc_fnn_sizes))
    self.enc_rnn_sizes = enc_rnn_sizes
    self.enc_rnn_cfgs = enc_rnn_cfgs
    self.enc_rnn_do = ConfigParser.list_parser(enc_rnn_do, len(enc_rnn_sizes))
    self.downsampling = ConfigParser.list_parser(downsampling, len(enc_rnn_sizes))
    self.dec_emb_size = dec_emb_size
    self.dec_emb_do = dec_emb_do
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    self.dec_rnn_do = ConfigParser.list_parser(dec_rnn_do, len(dec_rnn_sizes))
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg

    # modules #
    # init encoder #
    self.enc_fnn = nn.ModuleList()
    prev_size = enc_in_size
    for ii in range(len(enc_fnn_sizes)):
        self.enc_fnn.append(nn.Linear(prev_size, enc_fnn_sizes[ii]))
        prev_size = enc_fnn_sizes[ii]

    self.enc_rnn = nn.ModuleList()
    _enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs, len(enc_rnn_sizes))
    for ii in range(len(enc_rnn_sizes)):
        _rnn_cfg = {}
        _rnn_cfg['type'] = _enc_rnn_cfgs[ii]['type']
        _rnn_cfg['args'] = [prev_size, enc_rnn_sizes[ii], 1, True, True, 0,
                            _enc_rnn_cfgs[ii]['bi']]
        self.enc_rnn.append(generator_rnn(_rnn_cfg))
        prev_size = enc_rnn_sizes[ii] * (2 if _enc_rnn_cfgs[ii]['bi'] else 1)
    final_enc_size = prev_size

    # init decoder #
    self.dec_emb = nn.Embedding(self.dec_in_size, dec_emb_size, padding_idx=None)
    prev_size = dec_emb_size
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    # TODO : dec_cfg #
    self.dec = decoder.StandardDecoder(att_cfg, final_enc_size, dec_emb_size,
                                       dec_rnn_sizes, _dec_rnn_cfgs, dec_rnn_do)
    self.pre_softmax = nn.Linear(self.dec.output_size, n_class)
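
# The 7-element 'args' list passed to generator_rnn above lines up with the positional
# signature of torch.nn.LSTM / nn.GRU. A hedged sketch of the assumed mapping
# (generator_rnn itself is not shown in this file, so this is an illustration only):
import torch.nn as nn

def generator_rnn_sketch(cfg):
    in_size, hid_size, n_layers, bias, batch_first, dropout, bidirectional = cfg['args']
    rnn_cls = {'lstm': nn.LSTM, 'gru': nn.GRU}[cfg['type']]
    return rnn_cls(in_size, hid_size, num_layers=n_layers, bias=bias,
                   batch_first=batch_first, dropout=dropout, bidirectional=bidirectional)

# e.g. generator_rnn_sketch({'type': 'lstm', 'args': [80, 256, 1, True, True, 0, True]})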
def __init__(self, in_size, rnn_sizes=[512, 512], rnn_cfgs={'type': 'lstm', 'bi': True},
             do=0.25, downsampling={'type': 'none'}):
    super().__init__()
    self.in_size = in_size
    self.rnn_sizes = rnn_sizes
    self.rnn_cfgs = rnn_cfgs
    self.do = ConfigParser.list_parser(do, len(self.rnn_sizes))
    self.downsampling = ConfigParser.list_parser(downsampling, len(self.rnn_sizes))

    # init rnn #
    self.rnn_lyr = nn.ModuleList()
    _rnn_cfgs = ConfigParser.list_parser(self.rnn_cfgs, len(rnn_sizes))
    prev_size = in_size
    for ii in range(len(self.rnn_sizes)):
        _rnn_cfg = {'type': _rnn_cfgs[ii]['type'],
                    'args': [prev_size, rnn_sizes[ii], 1, True, True, 0,
                             _rnn_cfgs[ii].get('bi', False)]}
        self.rnn_lyr.append(generator_rnn(_rnn_cfg))
        prev_size = self.rnn_lyr[ii].hidden_size * (2 if self.rnn_lyr[ii].bidirectional else 1)

    self.output_size = prev_size
    self.out_features = self.output_size
def __init__(self, in_size, out_size, hid_sizes=[512, 512], act_fn='leaky_relu', do=0.0):
    super(Mel2SpecFNN, self).__init__()
    self.in_size = in_size
    self.out_size = out_size
    self.hid_sizes = hid_sizes
    self.act_fn = act_fn
    self.do = ConfigParser.list_parser(do, len(self.hid_sizes))

    _fnns = []
    prev_size = in_size
    for ii in range(len(self.hid_sizes)):
        _fnns.append(nn.Linear(prev_size, hid_sizes[ii]))
        prev_size = hid_sizes[ii]
    self.layers = nn.ModuleList(_fnns)
    self.proj = nn.Linear(prev_size, out_size)
def __init__(self, speaker_emb_dim=256, speaker_proj_size=[512], speaker_proj_fn='none',
             speaker_proj_do=0.0, speaker_integrate_fn='none', speaker_emb_scale=1.0,
             *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.speaker_emb_dim = speaker_emb_dim
    self.speaker_proj_size = speaker_proj_size
    self.speaker_proj_fn = speaker_proj_fn
    self.speaker_proj_do = ConfigParser.list_parser(speaker_proj_do, n=len(speaker_proj_size))
    self.speaker_emb_scale = speaker_emb_scale  # scalar x spk_vector #
    # speaker_integrate_fn is applied before the non-linearity on the decoder layer
    self.speaker_integrate_fn = speaker_integrate_fn

    _tmp = []
    prev_size = speaker_emb_dim
    for ii in range(len(self.speaker_proj_size)):
        _tmp.append(nn.Linear(prev_size, self.speaker_proj_size[ii]))
        _tmp.append(generator_act_module(self.speaker_proj_fn))
        _tmp.append(nn.Dropout(self.speaker_proj_do[ii]))
        prev_size = self.speaker_proj_size[ii]
    self.speaker_proj_lyr = nn.Sequential(*_tmp)

    self.speaker_module_lyr = nn.Module()
    # speaker proj -> decoder prenet (last layer) #
    self.speaker_module_lyr.add_module(
        'dec_proj_prenet_lyr',
        nn.Linear(prev_size, self.dec_prenet_lyr[-1].out_features))
    # speaker proj -> decoder regression core (first layer) #
    assert len(self.dec_core_gen_lyr) >= 1, "dec_core_gen_lyr must have at least 1 layer"
    self.speaker_module_lyr.add_module(
        'dec_proj_core_gen_lyr',
        nn.Linear(prev_size, self.dec_core_gen_lyr[0].out_features))
def __init__(self, dec_bern_end_size=[256], dec_bern_end_fn='Tanh', dec_bern_end_do=0.0,
             *args, **kwargs):
    super(TACOTRONBernoulliEnd, self).__init__(*args, **kwargs)
    self.dec_bern_end_size = dec_bern_end_size
    self.dec_bern_end_fn = dec_bern_end_fn
    self.dec_bern_end_do = ConfigParser.list_parser(dec_bern_end_do, len(dec_bern_end_size))

    # p(t = frame stop | dec_hid[t], y[t]) #
    _tmp = []
    prev_size = self.dec_att_lyr.output_size + self.dec_out_size
    for ii in range(len(dec_bern_end_size)):
        _tmp.append(nn.Linear(prev_size, self.dec_bern_end_size[ii]))
        _tmp.append(generator_act_module(self.dec_bern_end_fn))
        _tmp.append(nn.Dropout(p=self.dec_bern_end_do[ii]))
        prev_size = self.dec_bern_end_size[ii]
    _tmp.append(nn.Linear(prev_size, 1))
    # output is a logit, not passed through a sigmoid #
    self.dec_bernoulli_end_lyr = nn.Sequential(*_tmp)
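
# Because the frame-end layer outputs a raw logit, the usual pairing is
# nn.BCEWithLogitsLoss during training and a sigmoid threshold at inference time.
# A hedged sketch of that usage (tensor names and values are illustrative only):
import torch
import torch.nn as nn

end_logit = torch.randn(4, 1)                         # output of dec_bernoulli_end_lyr
end_target = torch.tensor([[0.], [0.], [1.], [0.]])   # 1 = last frame of the utterance

loss = nn.BCEWithLogitsLoss()(end_logit, end_target)  # training objective
stop_now = torch.sigmoid(end_logit) > 0.5             # inference-time stop decision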
def __init__(
        self, enc_in_size, dec_in_size, dec_out_size,
        enc_emb_size=256, enc_emb_do=0.0,
        enc_prenet_size=[256, 128], enc_prenet_do=[0.5, 0.5], enc_prenet_fn='leaky_relu',
        dec_prenet_size=[256, 128], dec_prenet_do=[0.5, 0.5], dec_prenet_fn='leaky_relu',
        dec_rnn_sizes=[256, 256], dec_rnn_cfgs={"type": "lstm"}, dec_rnn_do=0.0,
        dec_cfg={"type": "standard_decoder"}, att_cfg={"type": "mlp"},
        dec_core_gen_size=[512], dec_core_gen_fn='leaky_relu', dec_core_gen_do=0.0,
        # CBHG #
        enc_cbhg_cfg={},
        # FRAME ENDING #
        dec_bern_end_size=[256], dec_bern_end_fn='LeakyReLU', dec_bern_end_do=0.0,
        # OPTIONAL #
        dec_in_range=None):
    """
    Args:
        enc_in_size : size of vocab
        dec_in_size : input (mel) dim size
        dec_out_size : output (mel) dim size (usually same as dec_in_size)
        dec_in_range : pair of integers [x, y] in [0, dec_in_size]; all dims outside
            this range are masked to 0 (the Tacotron paper conditions on only the last
            time-step of each group instead of the whole group)
    """
    super().__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.dec_out_size = dec_out_size  # mel spec dim size
    self.enc_emb_size = enc_emb_size
    self.enc_emb_do = enc_emb_do
    self.enc_prenet_size = enc_prenet_size
    self.enc_prenet_do = ConfigParser.list_parser(enc_prenet_do, len(enc_prenet_size))
    self.enc_prenet_fn = enc_prenet_fn
    self.dec_prenet_size = dec_prenet_size
    self.dec_prenet_do = ConfigParser.list_parser(dec_prenet_do, len(dec_prenet_size))
    self.dec_prenet_fn = dec_prenet_fn
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = dec_rnn_cfgs
    self.dec_rnn_do = dec_rnn_do
    self.dec_core_gen_size = dec_core_gen_size
    self.dec_core_gen_fn = dec_core_gen_fn
    self.dec_core_gen_do = ConfigParser.list_parser(dec_core_gen_do, len(dec_core_gen_size))
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg
    # FRAME ENDING #
    self.dec_bern_end_size = dec_bern_end_size
    self.dec_bern_end_fn = dec_bern_end_fn
    self.dec_bern_end_do = ConfigParser.list_parser(dec_bern_end_do, len(dec_bern_end_size))
    # OPTIONAL #
    self.dec_in_range = dec_in_range
    if self.dec_in_range is not None:
        assert isinstance(self.dec_in_range, (list, tuple)) \
            and len(self.dec_in_range) == 2
    # CBHG config #
    self.enc_cbhg_cfg = ConfigParser.item_parser(enc_cbhg_cfg)

    self.enc_emb_lyr = nn.Embedding(enc_in_size, enc_emb_size)
    # init enc prenet #
    self.enc_prenet_lyr = nn.ModuleList()
    prev_size = enc_emb_size
    for ii in range(len(self.enc_prenet_size)):
        self.enc_prenet_lyr.append(nn.Linear(prev_size, enc_prenet_size[ii]))
        prev_size = enc_prenet_size[ii]
    # init enc middle #
    self.enc_core_lyr = cbhg.CBHG1d(prev_size, **enc_cbhg_cfg)

    # init dec prenet #
    self.dec_prenet_lyr = nn.ModuleList()
    prev_size = dec_in_size if self.dec_in_range is None else (
        (self.dec_in_range[-1] or 0) - (self.dec_in_range[-2] or 0))
    for ii in range(len(self.dec_prenet_size)):
        self.dec_prenet_lyr.append(nn.Linear(prev_size, dec_prenet_size[ii]))
        prev_size = dec_prenet_size[ii]

    # init dec rnn #
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    # TODO : dec_cfg #
    final_enc_size = self.enc_core_lyr.output_size
    self.dec_att_lyr = decoder.StandardDecoder(att_cfg, final_enc_size, prev_size,
                                               dec_rnn_sizes, dec_rnn_cfgs, dec_rnn_do)

    # init decoder layer melspec generator #
    prev_size = self.dec_att_lyr.output_size
    self.dec_core_gen_lyr = nn.ModuleList()
    for ii in range(len(self.dec_core_gen_size)):
        self.dec_core_gen_lyr.append(nn.Linear(prev_size, self.dec_core_gen_size[ii]))
        prev_size = self.dec_core_gen_size[ii]
    self.dec_core_gen_lyr.append(nn.Linear(prev_size, self.dec_out_size))

    # init decoder frame ending predictor #
    # p(t = STOP | dec_hid[t], y[t]) #
    _tmp = []
    prev_size = self.dec_att_lyr.output_size + self.dec_out_size
    for ii in range(len(dec_bern_end_size)):
        _tmp.append(nn.Linear(prev_size, self.dec_bern_end_size[ii]))
        _tmp.append(generator_act_module(self.dec_bern_end_fn))
        _tmp.append(nn.Dropout(p=self.dec_bern_end_do[ii]))
        prev_size = self.dec_bern_end_size[ii]
    _tmp.append(nn.Linear(prev_size, 1))
    # output is a logit, not passed through a sigmoid #
    self.dec_bernoulli_end_lyr = nn.Sequential(*_tmp)
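
# dec_in_range restricts which input (mel) dimensions condition the decoder; everything
# outside [x, y) is dropped / zeroed before the prenet, which is why the prenet input
# size above is computed as dec_in_range[-1] - dec_in_range[-2]. A hedged sketch of that
# masking (the real forward pass lives elsewhere; slicing vs. zero-masking is assumed):
import torch

def mask_dec_input_sketch(mel_frame, dec_in_range=None):
    # mel_frame: (batch, dec_in_size)
    if dec_in_range is None:
        return mel_frame
    start, end = dec_in_range
    return mel_frame[:, start:end]  # keep only the selected dims, matching the prenet input size

# e.g. with dec_in_range=[60, 80], an 80-dim grouped frame feeds only its last 20 dims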
def __init__(
        self, enc_in_size, dec_in_size, dec_out_size,
        enc_emb_size=256, enc_emb_do=0.0,
        enc_conv_sizes=[5, 5, 5], enc_conv_filter=[256, 256, 256],
        enc_conv_do=0.25, enc_conv_fn='LeakyReLU',
        enc_rnn_sizes=[256], enc_rnn_cfgs={"type": "lstm", 'bi': True}, enc_rnn_do=0.2,
        dec_prenet_size=[256, 256], dec_prenet_fn='leaky_relu', dec_prenet_do=0.25,
        dec_rnn_sizes=[512, 512], dec_rnn_cfgs={"type": "lstm"}, dec_rnn_do=0.2,
        dec_proj_size=[512, 512], dec_proj_fn='leaky_relu', dec_proj_do=0.25,
        dec_bern_end_size=[256], dec_bern_end_do=0.0, dec_bern_end_fn='LeakyReLU',
        dec_cfg={"type": "standard_decoder"},
        att_cfg={"type": "mlp_history",
                 "kwargs": {"history_conv_ksize": [2, 4, 8]}},  # location-sensitive attention
        # OPTIONAL #
        dec_in_range=None,
        use_bn=False,   # Tacotron V2 enables BatchNorm by default
        use_ln=False,   # use layer normalization on feedforward layers
):
    """
    Tacotron V2-style model.
    The decoder generates two outputs (mel + linear spec) and feeds the main output back
    as the conditional input for the next step.

    Args:
        enc_in_size : size of vocab
        dec_in_size : input (mel) dim size
        dec_out_size : output (mel/linear) dim size (usually same as dec_in_size)
        dec_in_range : pair of integers [x, y] in [0, dec_in_size]; all dims outside
            this range are masked to 0 (the Tacotron paper conditions on only the last
            time-step of each group instead of the whole group)
    """
    super().__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.dec_out_size = dec_out_size  # output projection -> mel/linear spec #
    self.enc_emb_size = enc_emb_size
    self.enc_emb_do = enc_emb_do
    self.enc_conv_sizes = enc_conv_sizes
    self.enc_conv_filter = enc_conv_filter
    self.enc_conv_do = ConfigParser.list_parser(enc_conv_do, len(enc_conv_sizes))
    self.enc_conv_fn = enc_conv_fn
    self.enc_rnn_sizes = enc_rnn_sizes
    self.enc_rnn_do = ConfigParser.list_parser(enc_rnn_do, len(enc_rnn_sizes))
    self.enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs, len(enc_rnn_sizes))
    self.dec_prenet_size = dec_prenet_size
    self.dec_prenet_do = ConfigParser.list_parser(dec_prenet_do, len(dec_prenet_size))
    self.dec_prenet_fn = dec_prenet_fn
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    self.dec_rnn_do = ConfigParser.list_parser(dec_rnn_do, len(dec_rnn_sizes))
    self.dec_proj_size = dec_proj_size
    self.dec_proj_fn = dec_proj_fn
    self.dec_proj_do = ConfigParser.list_parser(dec_proj_do, len(dec_proj_size))
    self.dec_bern_end_size = dec_bern_end_size
    self.dec_bern_end_do = ConfigParser.list_parser(dec_bern_end_do, len(dec_bern_end_size))
    self.dec_bern_end_fn = dec_bern_end_fn
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg
    self.use_bn = use_bn
    self.use_ln = use_ln
    if use_ln:
        raise ValueError("Layer Normalization is not supported yet!")
    # OPTIONAL #
    self.dec_in_range = dec_in_range
    if self.dec_in_range is not None:
        assert isinstance(self.dec_in_range, (list, tuple)) \
            and len(self.dec_in_range) == 2

    # init emb layer #
    self.enc_emb_lyr = nn.Embedding(enc_in_size, enc_emb_size)

    # init enc conv #
    _tmp = []
    prev_size = enc_emb_size
    for ii in range(len(self.enc_conv_sizes)):
        _tmp.append(Conv1dEv(prev_size, self.enc_conv_filter[ii],
                             self.enc_conv_sizes[ii], padding='same'))
        _tmp.append(generator_act_module(self.enc_conv_fn))
        if self.use_bn:
            _tmp.append(nn.BatchNorm1d(self.enc_conv_filter[ii]))
        _tmp.append(nn.Dropout(p=self.enc_conv_do[ii]))
        prev_size = self.enc_conv_filter[ii]
    self.enc_conv_lyr = nn.Sequential(*_tmp)

    # init enc rnn #
    self.enc_rnn_lyr = nn.ModuleList()
    _enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs, len(enc_rnn_sizes))
    for ii in range(len(self.enc_rnn_sizes)):
        _rnn_cfg = {}
        _rnn_cfg['type'] = _enc_rnn_cfgs[ii]['type']
        _rnn_cfg['args'] = [prev_size, enc_rnn_sizes[ii], 1, True, True, 0,
                            _enc_rnn_cfgs[ii]['bi']]
        self.enc_rnn_lyr.append(generator_rnn(_rnn_cfg))
        prev_size = enc_rnn_sizes[ii]

    # init dec prenet #
    _tmp = []
    prev_size = dec_in_size if self.dec_in_range is None else (
        (self.dec_in_range[-1] or 0) - (self.dec_in_range[-2] or 0))
    for ii in range(len(self.dec_prenet_size)):
        _tmp.append(nn.Linear(prev_size, self.dec_prenet_size[ii]))
        prev_size = self.dec_prenet_size[ii]
    self.dec_prenet_lyr = nn.ModuleList(_tmp)

    # init dec rnn #
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    final_enc_size = self.enc_rnn_lyr[-1].hidden_size * (
        2 if self.enc_rnn_lyr[-1].bidirectional else 1)
    assert 'type' in dec_cfg, "decoder type needs to be defined"
    if dec_cfg['type'] == 'standard_decoder':
        _tmp_dec_cfg = dict(dec_cfg)
        del _tmp_dec_cfg['type']
        self.dec_att_lyr = decoder.StandardDecoder(att_cfg=att_cfg,
                                                   ctx_size=final_enc_size,
                                                   in_size=prev_size,
                                                   rnn_sizes=dec_rnn_sizes,
                                                   rnn_cfgs=dec_rnn_cfgs,
                                                   rnn_do=dec_rnn_do,
                                                   **_tmp_dec_cfg)

    # init dec lin proj -> mel/linear spec #
    prev_size = self.dec_att_lyr.out_features
    _tmp = []
    for ii in range(len(self.dec_proj_size)):
        _tmp.append(nn.Linear(prev_size, self.dec_proj_size[ii]))
        prev_size = self.dec_proj_size[ii]
    _tmp.append(nn.Linear(prev_size, self.dec_out_size))
    self.dec_proj_lyr = nn.ModuleList(_tmp)

    # init dec bern end layer #
    _tmp = []
    prev_size = self.dec_out_size + self.dec_att_lyr.out_features + (
        self.enc_rnn_lyr[-1].hidden_size *
        (2 if self.enc_rnn_lyr[-1].bidirectional else 1))
    for ii in range(len(self.dec_bern_end_size)):
        _tmp.append(nn.Linear(prev_size, self.dec_bern_end_size[ii]))
        _tmp.append(generator_act_module(dec_bern_end_fn))
        _tmp.append(nn.Dropout(self.dec_bern_end_do[ii]))
        prev_size = self.dec_bern_end_size[ii]
    _tmp.append(nn.Linear(prev_size, 1))
    self.dec_bern_end_lyr = nn.Sequential(*_tmp)
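
# Conv1d-style layers (including, presumably, Conv1dEv) expect (batch, channels, time),
# while embeddings come out as (batch, time, channels); the usual pattern around a conv
# stack like enc_conv_lyr is a transpose on the way in and out. A hedged illustration:
import torch
import torch.nn as nn

emb = torch.randn(2, 17, 256)                        # (batch, time, emb) from nn.Embedding
conv = nn.Conv1d(256, 256, kernel_size=5, padding='same')

h = conv(emb.transpose(1, 2))                        # -> (batch, 256, time)
h = h.transpose(1, 2)                                # back to (batch, time, 256) for the RNN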
def __init__(
        self, enc_in_size, dec_in_size, n_class,
        enc_fnn_sizes=[512], enc_fnn_act='tanh', enc_fnn_do=0.25,
        enc_cnn_channels=256, enc_cnn_ksize=[5, 5, 5, 5], enc_cnn_do=0.25,
        enc_cnn_strides=[1, 1, 1, 1], enc_cnn_act='leaky_relu',
        dec_emb_size=64, dec_emb_do=0.0,
        dec_rnn_sizes=[512, 512], dec_rnn_cfgs={"type": "lstm"}, dec_rnn_do=0.25,
        dec_cfg={"type": "standard_decoder"}, att_cfg={"type": "mlp"},
):
    super(ENCCNN_DECRNN_ATT_ASR, self).__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.n_class = n_class
    self.enc_fnn_sizes = enc_fnn_sizes
    self.enc_fnn_act = enc_fnn_act
    self.enc_fnn_do = ConfigParser.list_parser(enc_fnn_do, len(enc_fnn_sizes))
    self.enc_cnn_channels = enc_cnn_channels  # use same size for highway #
    self.enc_cnn_ksize = enc_cnn_ksize
    self.enc_cnn_strides = enc_cnn_strides
    self.enc_cnn_do = ConfigParser.list_parser(enc_cnn_do, len(enc_cnn_ksize))
    self.enc_cnn_act = enc_cnn_act
    self.dec_emb_size = dec_emb_size
    self.dec_emb_do = dec_emb_do
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    self.dec_rnn_do = ConfigParser.list_parser(dec_rnn_do, len(dec_rnn_sizes))
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg

    # modules #
    # init encoder #
    self.enc_fnn = nn.ModuleList()
    prev_size = enc_in_size
    for ii in range(len(enc_fnn_sizes)):
        self.enc_fnn.append(nn.Linear(prev_size, enc_fnn_sizes[ii]))
        prev_size = enc_fnn_sizes[ii]

    self.enc_cnn = nn.ModuleList()
    self.use_pad1 = []
    # input treated as batch x ndim x seq x 1 #
    for ii in range(len(enc_cnn_ksize)):
        self.enc_cnn.append(
            nn.Conv2d(prev_size, enc_cnn_channels,
                      kernel_size=(self.enc_cnn_ksize[ii], 1),
                      stride=(self.enc_cnn_strides[ii], 1),
                      padding=((self.enc_cnn_ksize[ii] - 1) // 2, 0)))
        # even kernels need one extra pad step to keep the sequence length #
        self.use_pad1.append(self.enc_cnn_ksize[ii] % 2 == 0)
        prev_size = enc_cnn_channels
    final_enc_size = prev_size

    # init position embedding function #
    self.pos_emb = nn.Linear(1, final_enc_size)

    # init decoder #
    self.dec_emb = nn.Embedding(self.dec_in_size, dec_emb_size, padding_idx=None)
    prev_size = dec_emb_size
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    # TODO : dec_cfg #
    self.dec = decoder.StandardDecoder(att_cfg, final_enc_size, dec_emb_size,
                                       dec_rnn_sizes, _dec_rnn_cfgs, dec_rnn_do)
    self.pre_softmax = nn.Linear(self.dec.output_size, n_class)
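
# Why use_pad1 flags even kernel sizes: with padding (k-1)//2 on both sides, a stride-1
# conv yields output length L + 2*((k-1)//2) - k + 1, which equals L for odd k but L-1
# for even k, so one extra pad step is needed to keep the time axis aligned. A quick check:
def conv_out_len(L, k, pad, stride=1):
    return (L + 2 * pad - k) // stride + 1

assert conv_out_len(100, 5, (5 - 1) // 2) == 100   # odd kernel: length preserved
assert conv_out_len(100, 4, (4 - 1) // 2) == 99    # even kernel: one frame short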
def __init__(self, enc_in_size, dec_in_size, n_class,
             enc_fnn_sizes=[512], enc_fnn_act='tanh', enc_fnn_do=0.25,
             enc_rnn_sizes=[256, 256, 256], enc_rnn_cfgs={"type": "lstm", "bi": True},
             enc_rnn_do=0.25, downsampling=None,
             dec_emb_size=64, dec_emb_do=0.0,
             dec_rnn_sizes=[512, 512], dec_rnn_cfgs={"type": "lstm"}, dec_rnn_do=0.25,
             dec_cfg={"type": "standard_decoder"}, att_cfg={"type": "mlp"},
             enc_prior_cfg=None,
             dec_prior_cfg={'type': 'normal', 'kwargs': {'mu': 0, 'sigma': 1.0}}):
    super().__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.n_class = n_class
    self.enc_fnn_sizes = enc_fnn_sizes
    self.enc_fnn_act = enc_fnn_act
    self.enc_fnn_do = ConfigParser.list_parser(enc_fnn_do, len(enc_fnn_sizes))
    self.enc_rnn_sizes = enc_rnn_sizes
    self.enc_rnn_cfgs = enc_rnn_cfgs
    self.enc_rnn_do = ConfigParser.list_parser(enc_rnn_do, len(enc_rnn_sizes))
    self.downsampling = ConfigParser.list_parser(downsampling, len(enc_rnn_sizes))
    self.dec_emb_size = dec_emb_size
    self.dec_emb_do = dec_emb_do
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    self.dec_rnn_do = ConfigParser.list_parser(dec_rnn_do, len(dec_rnn_sizes))
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg
    self.enc_prior_cfg = enc_prior_cfg
    self.dec_prior_cfg = dec_prior_cfg

    # modules #
    # init encoder #
    # TODO : add bayesian encoder RNN #
    self.enc_fnn = nn.ModuleList()
    prev_size = enc_in_size
    for ii in range(len(enc_fnn_sizes)):
        self.enc_fnn.append(nn.Linear(prev_size, enc_fnn_sizes[ii]))
        prev_size = enc_fnn_sizes[ii]

    self.enc_rnn = nn.ModuleList()
    _enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs, len(enc_rnn_sizes))
    for ii in range(len(enc_rnn_sizes)):
        _rnn_cfg = {}
        _rnn_cfg['type'] = _enc_rnn_cfgs[ii]['type']
        _rnn_cfg['args'] = [prev_size, enc_rnn_sizes[ii], 1, True, True, 0,
                            _enc_rnn_cfgs[ii]['bi']]
        self.enc_rnn.append(generator_rnn(_rnn_cfg))
        prev_size = enc_rnn_sizes[ii] * (2 if _enc_rnn_cfgs[ii]['bi'] else 1)
    final_enc_size = prev_size

    # init decoder #
    # TODO : add bayesian decoder RNN #
    self.dec_emb = nn.Embedding(self.dec_in_size, dec_emb_size, padding_idx=None)
    prev_size = dec_emb_size
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    # TODO : dec_cfg #
    self.dec = decoder.StandardDecoder(att_cfg, final_enc_size, dec_emb_size,
                                       dec_rnn_sizes, _dec_rnn_cfgs, dec_rnn_do)

    if self.dec_prior_cfg is None:
        self.pre_softmax = nn.Linear(self.dec.output_size, n_class)
    else:
        # Bayesian output layer: Gaussian posterior over weights, prior from dec_prior_cfg #
        self.pre_softmax = LinearBayes(self.dec.output_size, n_class,
                                       posterior_w=NormalRV(0, 0.05),
                                       posterior_b=NormalRV(0, 0.05),
                                       prior_w=generator_bayes_rv(self.dec_prior_cfg),
                                       prior_b=generator_bayes_rv(self.dec_prior_cfg))
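
# LinearBayes above keeps a Gaussian posterior over its weights; a minimal sketch of the
# reparameterized weight sampling such a layer typically performs (this is a generic
# Bayes-by-backprop illustration, not the actual LinearBayes implementation):
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class LinearBayesSketch(nn.Module):
    def __init__(self, in_features, out_features, posterior_std=0.05):
        super().__init__()
        self.w_mu = nn.Parameter(torch.zeros(out_features, in_features))
        self.w_logstd = nn.Parameter(
            torch.full((out_features, in_features), math.log(posterior_std)))

    def forward(self, x):
        # sample W ~ N(mu, std^2) with the reparameterization trick, then apply it
        w = self.w_mu + torch.exp(self.w_logstd) * torch.randn_like(self.w_mu)
        return F.linear(x, w)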
def __init__(self, in_size, conv_bank_k=8, conv_bank_act='leaky_relu', conv_bank_filter=128,
             pool_size=2, conv_proj_k=[3, 3], conv_proj_filter=[128, 128],
             conv_proj_act=['leaky_relu', 'none'], highway_size=128, highway_lyr=4,
             highway_act='leaky_relu', rnn_cfgs={'type': 'gru', 'bi': True},
             rnn_sizes=[128], use_bn=True):
    super(CBHG1d, self).__init__()
    # conv bank multiscale #
    self.conv_bank_lyr = MultiscaleConv1d(in_size, conv_bank_filter,
                                          kernel_sizes=list(range(1, conv_bank_k + 1)),
                                          padding='same')
    if use_bn:
        self.conv_bank_bn = nn.BatchNorm1d(self.conv_bank_lyr.out_channels)
    self.conv_bank_act = conv_bank_act

    self.pool_lyr = MaxPool1dEv(pool_size, stride=1, padding='same')

    self.conv_proj_lyr = nn.ModuleList()
    if use_bn:
        self.conv_proj_bn = nn.ModuleList()
    prev_filter = self.conv_bank_lyr.out_channels
    for ii in range(len(conv_proj_k)):
        self.conv_proj_lyr.append(Conv1dEv(prev_filter, conv_proj_filter[ii],
                                           kernel_size=conv_proj_k[ii], padding='same'))
        if use_bn:
            self.conv_proj_bn.append(nn.BatchNorm1d(conv_proj_filter[ii]))
        prev_filter = conv_proj_filter[ii]
    self.conv_proj_act = conv_proj_act
    # conv projection output must match the input size (CBHG adds a residual connection) #
    assert prev_filter == in_size

    self.pre_highway_lyr = nn.Linear(prev_filter, highway_size)
    self.highway_lyr = HighwayFNN(highway_size, highway_lyr,
                                  fn_act=generator_act_fn(highway_act))
    self.highway_act = highway_act
    self.use_bn = use_bn

    self.rnn_lyr = nn.ModuleList()
    rnn_cfgs = ConfigParser.list_parser(rnn_cfgs, len(rnn_sizes))
    prev_size = highway_size
    for ii in range(len(rnn_sizes)):
        _rnn_cfg = {}
        _rnn_cfg['type'] = rnn_cfgs[ii]['type']
        _rnn_cfg['args'] = [prev_size, rnn_sizes[ii], 1, True, True, 0, rnn_cfgs[ii]['bi']]
        self.rnn_lyr.append(generator_rnn(_rnn_cfg))
        prev_size = rnn_sizes[ii] * (2 if rnn_cfgs[ii]['bi'] else 1)

    self.output_size = prev_size
    self.out_features = prev_size
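
# HighwayFNN stacks highway layers; the standard highway formulation is
# y = T(x) * H(x) + (1 - T(x)) * x with a sigmoid transform gate T. A hedged sketch of
# one such layer (the actual HighwayFNN implementation may differ in details):
import torch
import torch.nn as nn

class HighwayLayerSketch(nn.Module):
    def __init__(self, size, fn_act=torch.relu):
        super().__init__()
        self.h = nn.Linear(size, size)   # candidate transform H(x)
        self.t = nn.Linear(size, size)   # transform gate T(x)
        self.fn_act = fn_act

    def forward(self, x):
        t = torch.sigmoid(self.t(x))
        return t * self.fn_act(self.h(x)) + (1.0 - t) * x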
def __init__(
        self, enc_in_size, dec_in_size, n_class,
        enc_cnn_sizes=[80, 25, 10, 5], enc_cnn_act='leaky_relu',
        enc_cnn_stride=[4, 2, 1, 1], enc_cnn_do=0.0, enc_cnn_filter=256,
        enc_cnn_gated=[False, False, False, False], use_bn=False,
        enc_nin_filter=[128, 128],
        enc_rnn_sizes=[256, 256, 256], enc_rnn_cfgs={"type": "lstm", "bi": True},
        enc_rnn_do=0.25, downsampling=None,
        dec_emb_size=64, dec_emb_do=0.0,
        dec_rnn_sizes=[512, 512], dec_rnn_cfgs={"type": "lstm"}, dec_rnn_do=0.25,
        dec_cfg={"type": "standard_decoder"}, att_cfg={"type": "mlp"},
):
    super(ENCCNNRNN_DECRNN_ATT_ASR, self).__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.n_class = n_class
    self.enc_cnn_sizes = enc_cnn_sizes  # kernel size #
    self.enc_cnn_act = enc_cnn_act
    self.enc_cnn_gated = ConfigParser.list_parser(enc_cnn_gated, len(enc_cnn_sizes))
    self.enc_cnn_stride = enc_cnn_stride
    self.enc_cnn_filter = ConfigParser.list_parser(enc_cnn_filter, len(enc_cnn_sizes))
    self.enc_cnn_do = ConfigParser.list_parser(enc_cnn_do, len(enc_cnn_sizes))
    self.use_bn = use_bn
    self.enc_nin_filter = enc_nin_filter
    self.enc_rnn_sizes = enc_rnn_sizes
    self.enc_rnn_cfgs = enc_rnn_cfgs
    self.enc_rnn_do = ConfigParser.list_parser(enc_rnn_do, len(enc_rnn_sizes))
    self.downsampling = ConfigParser.list_parser(downsampling, len(enc_rnn_sizes))
    self.dec_emb_size = dec_emb_size
    self.dec_emb_do = dec_emb_do
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    self.dec_rnn_do = ConfigParser.list_parser(dec_rnn_do, len(dec_rnn_sizes))
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg

    # modules #
    # init encoder #
    self.enc_cnn = nn.ModuleList()
    self.enc_cnn_bn = nn.ModuleList()
    prev_size = enc_in_size
    prev_ch = 1
    for ii in range(len(enc_cnn_sizes)):
        _cnn_lyr = GatedConv2dLinearUnit if self.enc_cnn_gated[ii] else Conv2dEv
        self.enc_cnn.append(_cnn_lyr(prev_ch, self.enc_cnn_filter[ii],
                                     (self.enc_cnn_sizes[ii], 1),
                                     stride=(self.enc_cnn_stride[ii], 1),
                                     padding='valid', dilation=1))
        self.enc_cnn_bn.append(nn.BatchNorm2d(self.enc_cnn_filter[ii]))
        prev_size = enc_cnn_sizes[ii]
        prev_ch = self.enc_cnn_filter[ii]

    # network-in-network (1x1 conv) layers #
    self.enc_nin = nn.ModuleList()
    for ii in range(len(enc_nin_filter)):
        self.enc_nin.append(nn.Conv2d(prev_ch, enc_nin_filter[ii], [1, 1]))
        prev_ch = enc_nin_filter[ii]

    self.enc_raw_enc = nn.ModuleList([self.enc_cnn, self.enc_cnn_bn, self.enc_nin])
    prev_size = prev_ch  # global pooling after conv #

    self.enc_rnn = nn.ModuleList()
    _enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs, len(enc_rnn_sizes))
    for ii in range(len(enc_rnn_sizes)):
        _rnn_cfg = {}
        _rnn_cfg['type'] = _enc_rnn_cfgs[ii]['type']
        _rnn_cfg['args'] = [prev_size, enc_rnn_sizes[ii], 1, True, True, 0,
                            _enc_rnn_cfgs[ii]['bi']]
        self.enc_rnn.append(generator_rnn(_rnn_cfg))
        prev_size = enc_rnn_sizes[ii] * (2 if _enc_rnn_cfgs[ii]['bi'] else 1)
    final_enc_size = prev_size

    # init decoder #
    self.dec_emb = nn.Embedding(self.dec_in_size, dec_emb_size, padding_idx=None)
    prev_size = dec_emb_size
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    # TODO : dec_cfg #
    self.dec = decoder.StandardDecoder(att_cfg, final_enc_size, dec_emb_size,
                                       dec_rnn_sizes, _dec_rnn_cfgs, dec_rnn_do)
    self.pre_softmax = nn.Linear(self.dec.output_size, n_class)
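
# GatedConv2dLinearUnit presumably implements a GLU-style convolution: one half of the
# conv output provides the candidate features, the other half a sigmoid gate, and the
# two are multiplied. A hedged sketch of that idea (not the actual class):
import torch
import torch.nn as nn

class GatedConv2dSketch(nn.Module):
    def __init__(self, in_ch, out_ch, kernel_size, stride=1):
        super().__init__()
        # a single conv with 2*out_ch channels, split into value and gate halves
        self.conv = nn.Conv2d(in_ch, 2 * out_ch, kernel_size, stride=stride)

    def forward(self, x):
        value, gate = self.conv(x).chunk(2, dim=1)
        return value * torch.sigmoid(gate)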
def __init__(
        self, enc_in_size, dec_in_size, dec_out_size,
        enc_fnn_sizes=[512], enc_fnn_act='LeakyReLU', enc_fnn_do=0.25,
        enc_rnn_sizes=[256, 256, 256], enc_rnn_cfgs={"type": "lstm", "bi": True},
        enc_rnn_do=0.25, downsampling=[False, True, True],
        dec_emb_size=256, dec_emb_do=0.25,
        dec_emb_tied_weight=True,  # tie the char/word embedding weight with the softmax layer
        dec_rnn_sizes=[512, 512], dec_rnn_cfgs={"type": "lstm"}, dec_rnn_do=0.25,
        dec_cfg={"type": "standard_decoder"}, att_cfg={"type": "mlp"},
        use_layernorm=False,
):
    super().__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.dec_out_size = dec_out_size
    self.enc_fnn_sizes = enc_fnn_sizes
    self.enc_fnn_act = enc_fnn_act
    self.enc_fnn_do = ConfigParser.list_parser(enc_fnn_do, len(enc_fnn_sizes))
    self.enc_rnn_sizes = enc_rnn_sizes
    self.enc_rnn_cfgs = enc_rnn_cfgs
    self.enc_rnn_do = ConfigParser.list_parser(enc_rnn_do, len(enc_rnn_sizes))
    self.downsampling = ConfigParser.list_parser(downsampling, len(enc_rnn_sizes))
    self.dec_emb_size = dec_emb_size
    self.dec_emb_do = dec_emb_do
    self.dec_emb_tied_weight = dec_emb_tied_weight
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    self.dec_rnn_do = ConfigParser.list_parser(dec_rnn_do, len(dec_rnn_sizes))
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg
    self.use_layernorm = use_layernorm
    if self.use_layernorm:
        raise ValueError("LayerNorm is not implemented yet")

    # modules #
    # init encoder #
    prev_size = enc_in_size
    _tmp = []
    for ii in range(len(enc_fnn_sizes)):
        _tmp.append(nn.Linear(prev_size, enc_fnn_sizes[ii]))
        if use_layernorm:
            _tmp.append(LayerNorm(enc_fnn_sizes[ii]))
        _tmp.append(generator_act_module(enc_fnn_act))
        _tmp.append(nn.Dropout(p=self.enc_fnn_do[ii]))
        prev_size = enc_fnn_sizes[ii]
    self.enc_fnn_lyr = nn.Sequential(*_tmp)

    self.enc_rnn_lyr = nn.ModuleList()
    _enc_rnn_cfgs = ConfigParser.list_parser(enc_rnn_cfgs, len(enc_rnn_sizes))
    for ii in range(len(enc_rnn_sizes)):
        _rnn_cfg = {}
        _rnn_cfg['type'] = _enc_rnn_cfgs[ii]['type']
        _rnn_cfg['args'] = [prev_size, enc_rnn_sizes[ii], 1, True, True, 0,
                            _enc_rnn_cfgs[ii]['bi']]
        self.enc_rnn_lyr.append(generator_rnn(_rnn_cfg))
        prev_size = enc_rnn_sizes[ii] * (2 if _enc_rnn_cfgs[ii]['bi'] else 1)
    final_enc_size = prev_size

    # init decoder #
    self.dec_emb_lyr = nn.Embedding(self.dec_in_size, dec_emb_size, padding_idx=None)
    prev_size = dec_emb_size
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    # TODO : dec_cfg #
    assert 'type' in dec_cfg, "decoder type needs to be defined"
    if dec_cfg['type'] == 'standard_decoder':
        _tmp_dec_cfg = dict(dec_cfg)
        del _tmp_dec_cfg['type']
        self.dec_att_lyr = decoder.StandardDecoder(att_cfg=att_cfg,
                                                   ctx_size=final_enc_size,
                                                   in_size=dec_emb_size,
                                                   rnn_sizes=dec_rnn_sizes,
                                                   rnn_cfgs=_dec_rnn_cfgs,
                                                   rnn_do=dec_rnn_do,
                                                   **_tmp_dec_cfg)
    else:
        raise NotImplementedError("decoder type {} is not found".format(dec_cfg['type']))

    self.dec_presoftmax_lyr = nn.Linear(self.dec_att_lyr.output_size, dec_out_size)
    if dec_emb_tied_weight:
        # weight tying requires matching vocab and embedding/presoftmax dimensions #
        assert dec_out_size == dec_in_size \
            and self.dec_emb_lyr.embedding_dim == self.dec_presoftmax_lyr.in_features
        self.dec_presoftmax_lyr.weight = self.dec_emb_lyr.weight
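
# Weight tying works here because nn.Embedding.weight and nn.Linear.weight share the
# same (vocab, emb_dim) shape, so the presoftmax projection can literally reuse the
# embedding matrix. A standalone illustration of the same trick:
import torch.nn as nn

vocab, emb_dim = 1000, 256
emb = nn.Embedding(vocab, emb_dim)       # weight: (vocab, emb_dim)
presoftmax = nn.Linear(emb_dim, vocab)   # weight: (vocab, emb_dim)
presoftmax.weight = emb.weight           # both layers now update the same parameter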
def __init__(
        self, enc_in_size, dec_in_size, dec_out_size, dec_out_post_size,
        enc_emb_size=256, enc_emb_do=0.0,
        enc_prenet_size=[256, 128], enc_prenet_do=[0.5, 0.5], enc_prenet_fn='leaky_relu',
        dec_prenet_size=[256, 128], dec_prenet_do=[0.5, 0.5], dec_prenet_fn='leaky_relu',
        dec_rnn_sizes=[256, 256], dec_rnn_cfgs={"type": "lstm"}, dec_rnn_do=0.0,
        dec_cfg={"type": "standard_decoder"}, att_cfg={"type": "mlp"},
        # CBHG #
        enc_cbhg_cfg={}, dec_postnet_cbhg_cfg={},
        # OPTIONAL #
        dec_in_range=None):
    """
    Args:
        enc_in_size : size of vocab
        dec_in_size : input (mel) dim size
        dec_out_size : output (mel) dim size (usually same as dec_in_size)
        dec_out_post_size : output (linear) dim size
        dec_in_range : pair of integers [x, y] in [0, dec_in_size]; all dims outside
            this range are masked to 0 (the Tacotron paper conditions on only the last
            time-step of each group instead of the whole group)
    """
    super(TACOTRON, self).__init__()
    self.enc_in_size = enc_in_size
    self.dec_in_size = dec_in_size
    self.dec_out_size = dec_out_size            # first output -> mel spec #
    self.dec_out_post_size = dec_out_post_size  # second output -> raw spec #
    self.enc_emb_size = enc_emb_size
    self.enc_emb_do = enc_emb_do
    self.enc_prenet_size = enc_prenet_size
    self.enc_prenet_do = ConfigParser.list_parser(enc_prenet_do, len(enc_prenet_size))
    self.enc_prenet_fn = enc_prenet_fn
    self.dec_prenet_size = dec_prenet_size
    self.dec_prenet_do = ConfigParser.list_parser(dec_prenet_do, len(dec_prenet_size))
    self.dec_prenet_fn = dec_prenet_fn
    self.dec_rnn_sizes = dec_rnn_sizes
    self.dec_rnn_cfgs = dec_rnn_cfgs
    self.dec_rnn_do = dec_rnn_do
    self.dec_cfg = dec_cfg
    self.att_cfg = att_cfg
    # OPTIONAL #
    self.dec_in_range = dec_in_range
    if self.dec_in_range is not None:
        assert isinstance(self.dec_in_range, (list, tuple)) \
            and len(self.dec_in_range) == 2
    # CBHG config #
    self.enc_cbhg_cfg = ConfigParser.item_parser(enc_cbhg_cfg)
    self.dec_postnet_cbhg_cfg = ConfigParser.item_parser(dec_postnet_cbhg_cfg)

    self.enc_emb_lyr = nn.Embedding(enc_in_size, enc_emb_size)
    # init enc prenet #
    self.enc_prenet_lyr = nn.ModuleList()
    prev_size = enc_emb_size
    for ii in range(len(self.enc_prenet_size)):
        self.enc_prenet_lyr.append(nn.Linear(prev_size, enc_prenet_size[ii]))
        prev_size = enc_prenet_size[ii]
    # init enc middle #
    self.enc_core_lyr = cbhg.CBHG1d(prev_size, **enc_cbhg_cfg)

    # init dec prenet #
    self.dec_prenet_lyr = nn.ModuleList()
    prev_size = dec_in_size if self.dec_in_range is None else (
        (self.dec_in_range[-1] or 0) - (self.dec_in_range[-2] or 0))
    for ii in range(len(self.dec_prenet_size)):
        self.dec_prenet_lyr.append(nn.Linear(prev_size, dec_prenet_size[ii]))
        prev_size = dec_prenet_size[ii]

    # init dec rnn #
    _dec_rnn_cfgs = ConfigParser.list_parser(dec_rnn_cfgs, len(dec_rnn_sizes))
    for ii in range(len(dec_rnn_sizes)):
        _type = _dec_rnn_cfgs[ii]['type']
        if re.match('stateful.*cell', _type) is None:
            _dec_rnn_cfgs[ii]['type'] = 'stateful_{}cell'.format(_type)
    # TODO : dec_cfg #
    final_enc_size = self.enc_core_lyr.output_size
    self.dec_att_lyr = decoder.StandardDecoder(att_cfg, final_enc_size, prev_size,
                                               dec_rnn_sizes, dec_rnn_cfgs, dec_rnn_do)

    # init dec regression melspec #
    self.dec_first_reg_lyr = nn.Linear(self.dec_att_lyr.output_size, self.dec_out_size)
    # init dec postnet #
    self.dec_postnet_lyr = cbhg.CBHG1d(self.dec_out_size,
                                       conv_proj_filter=[256, dec_out_size],
                                       **dec_postnet_cbhg_cfg)
    # init dec regression rawspec #
    self.dec_second_reg_lyr = nn.Linear(self.dec_postnet_lyr.output_size,
                                        self.dec_out_post_size)
def __init__(self, att_cfg, ctx_size, in_size, rnn_sizes=[512, 512],
             rnn_cfgs={'type': 'stateful_lstmcell'}, rnn_do=0.25,
             ctx_proj_size=256, ctx_proj_fn='tanh',
             scale_src=1.0, scale_tgt=1.0,
             att_nth_layer=-1, input_feed=0, ctx_gate=None):
    """
    ctx_proj_size : size of the projection layer applied after the context vector
    att_nth_layer : attach the attention layer on the n-th RNN layer
    input_feed : input feeding strategy (see "Effective Approaches to Attention-based NMT")
    scale_src : scaling for the expected context vector before concatenation
    scale_tgt : scaling for the RNN hidden vector before concatenation
    """
    super().__init__()
    self.rnn_sizes = rnn_sizes
    self.rnn_cfgs = rnn_cfgs
    self.rnn_do = ConfigParser.list_parser(rnn_do, len(rnn_sizes))
    self.ctx_proj_size = ctx_proj_size
    self.ctx_proj_fn = ctx_proj_fn
    assert 0 <= input_feed < len(rnn_sizes)
    self.input_feed = input_feed
    # normalize att_nth_layer so negative indices count from the last RNN layer #
    if att_nth_layer >= 0:
        self.att_nth_layer = att_nth_layer
    else:
        self.att_nth_layer = len(rnn_sizes) + att_nth_layer
    assert self.att_nth_layer >= 0
    self.scale_src = scale_src
    self.scale_tgt = scale_tgt
    assert ctx_gate in [None, 'all', 'src', 'tgt']
    self.ctx_gate = ctx_gate

    rnn_cfgs = ConfigParser.list_parser(rnn_cfgs, len(rnn_sizes))
    assert att_nth_layer >= 1 or att_nth_layer <= -1
    self.stack_rnn_lyr = nn.ModuleList()
    prev_size = in_size
    for ii in range(len(rnn_sizes)):
        prev_size += (ctx_proj_size if input_feed == ii else 0)
        rnn_cfg = rnn_cfgs[ii]
        rnn_cfg['args'] = [prev_size, rnn_sizes[ii]]
        _rnn_layer = generator_rnn(rnn_cfg)
        assert isinstance(_rnn_layer, StatefulBaseCell), \
            "decoder can only use StatefulBaseCell layers"
        self.stack_rnn_lyr.append(_rnn_layer)
        prev_size = rnn_sizes[ii]
        if self.att_nth_layer == ii:
            # init attention #
            att_cfg['args'] = [ctx_size, rnn_sizes[self.att_nth_layer]]
            self.att_layer = generator_attention(att_cfg)
            if self.ctx_gate is None:
                # without a context gate, a simple projection combines RNN state and context #
                self.ctx_proj_lyr = nn.Linear(rnn_sizes[ii] + self.att_layer.out_features,
                                              ctx_proj_size)
                prev_size = ctx_proj_size
    self.output_size = prev_size

    # additional : context gate (scales information from source & target with a non-linear proj)
    if self.ctx_gate is not None:
        self.ctx_gate_lyr = ContextGate(self.ctx_gate, self.att_layer.out_features,
                                        self.rnn_sizes[self.att_nth_layer], ctx_proj_size)
    # TODO : remove output_size argument #
    self.out_features = prev_size
    self.reset()
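
# Input feeding (Luong et al.) concatenates the previous step's context projection onto
# the decoder input of a chosen layer, which is why the layer at index `input_feed`
# above receives ctx_proj_size extra input dims. A hedged, simplified sketch of one
# decoding step with input_feed == 0 (names and sizes are illustrative):
import torch
import torch.nn as nn

emb_size, ctx_proj_size, hid_size = 64, 256, 512
cell = nn.LSTMCell(emb_size + ctx_proj_size, hid_size)

y_emb = torch.randn(8, emb_size)               # current target embedding
prev_ctx_proj = torch.zeros(8, ctx_proj_size)  # context projection from the previous step
h, c = cell(torch.cat([y_emb, prev_ctx_proj], dim=-1))
# ... attention over encoder states would use h, then produce the next context projection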
def __init__(
        self, in_size, n_class, fnn_sizes, rnn_sizes,
        do_fnn=0.0, do_rnn=0.0, downsampling=None, fnn_act='tanh',
        rnn_cfgs='{"type":"lstm", "bi":true}',
        ivec_cfg={"type": "concat"}, ivec_dim=100,
        use_pack=False, train=True,
):
    super(FNN_RNN_IVEC_CTC, self).__init__()
    self.in_size = in_size
    self.n_class = n_class
    self.fnn_sizes = fnn_sizes
    self.rnn_sizes = rnn_sizes
    self.downsampling = downsampling
    # dropout configs: a full-length list is used as-is, otherwise the value is tiled #
    self.do_fnn = do_fnn if isinstance(do_fnn, list) and len(do_fnn) == len(fnn_sizes) \
        else list(do_fnn) * len(self.fnn_sizes)
    self.do_rnn = do_rnn if isinstance(do_rnn, list) and len(do_rnn) == len(rnn_sizes) \
        else list(do_rnn) * len(self.rnn_sizes)
    self.fnn_act = fnn_act
    self.rnn_cfgs = rnn_cfgs
    self.ivec_cfg = ivec_cfg
    self.ivec_dim = ivec_dim
    self.use_pack = use_pack

    # modules #
    self.fnn = nn.ModuleList()
    prev_size = in_size
    for ii in range(len(fnn_sizes)):
        self.fnn.append(nn.Linear(prev_size, fnn_sizes[ii]))
        prev_size = fnn_sizes[ii]

    self.rnn = nn.ModuleList()
    _rnn_cfgs = ConfigParser.list_parser(rnn_cfgs, len(rnn_sizes))
    for ii in range(len(rnn_sizes)):
        _rnn_cfg = {}
        _rnn_cfg['type'] = _rnn_cfgs[ii]['type']
        _rnn_cfg['args'] = [prev_size, rnn_sizes[ii], 1, True, False, 0, _rnn_cfgs[ii]['bi']]
        self.rnn.append(generator_rnn(_rnn_cfg))
        prev_size = rnn_sizes[ii] * (2 if _rnn_cfgs[ii]['bi'] else 1)
    self.pre_softmax = nn.Linear(prev_size, n_class)

    ### extension for ivec ###
    if ivec_cfg['type'] == 'concat':
        # i-vector is projected to the first FNN layer size (integration happens in forward) #
        self.aug_layer = nn.Linear(ivec_dim, fnn_sizes[0])
    elif ivec_cfg['type'] == 'aug_hid':
        # main_param : hid x in #
        _in_size = in_size
        _hid_size = fnn_sizes[0]
        self.aug_layer = nn.Linear(ivec_dim, _hid_size)
        self.scale = ivec_cfg.get('scale', 1.0)

        def fn_gen_params(ivec_feat):
            return self.aug_layer(ivec_feat)

        def fn_aug_params(main_param, aug_param):
            return main_param + self.scale * aug_param.t().expand_as(main_param)
    elif ivec_cfg['type'] == 'aug_in':
        _in_size = in_size
        _hid_size = fnn_sizes[0]
        self.aug_layer = nn.Linear(ivec_dim, _in_size)
        self.scale = ivec_cfg.get('scale', 1.0)

        def fn_gen_params(ivec_feat):
            return self.aug_layer(ivec_feat)

        def fn_aug_params(main_param, aug_param):
            return main_param + self.scale * aug_param.expand_as(main_param)
    elif ivec_cfg['type'] == 'aug_lr':
        # low-rank additive adaptation of the first FNN weight matrix #
        _rank = ivec_cfg.get('rank', 3)
        _in_size = in_size
        _hid_size = fnn_sizes[0]
        self.aug_layer_in2lr = nn.Linear(ivec_dim, _in_size * _rank)
        self.aug_layer_hid2lr = nn.Linear(ivec_dim, _hid_size * _rank)
        self.scale = ivec_cfg.get('scale', 1.0)

        def fn_gen_params(ivec_feat):
            _mat_in2lr = self.aug_layer_in2lr(ivec_feat)
            _mat_hid2lr = self.aug_layer_hid2lr(ivec_feat)
            _mat_in2lr = _mat_in2lr.view(_rank, _in_size)
            _mat_hid2lr = _mat_hid2lr.view(_hid_size, _rank)
            _mat_in2hid = torch.mm(_mat_hid2lr, _mat_in2lr)
            return _mat_in2hid

        def fn_aug_params(main_param, aug_param):
            return main_param + self.scale * aug_param
    else:
        raise NotImplementedError

    if ivec_cfg['type'] != 'concat':
        self.fn_gen_params = fn_gen_params
        self.fn_aug_params = fn_aug_params
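
# The 'aug_lr' branch builds a speaker-dependent low-rank update for the first FNN
# weight matrix: (hid x rank) @ (rank x in) -> (hid x in), added to the main weight.
# A self-contained shape check of that arithmetic (sizes here are illustrative only):
import torch

hid_size, in_size, rank = 512, 40, 3
mat_hid2lr = torch.randn(hid_size, rank)    # from aug_layer_hid2lr(ivec), reshaped
mat_in2lr = torch.randn(rank, in_size)      # from aug_layer_in2lr(ivec), reshaped
delta_w = torch.mm(mat_hid2lr, mat_in2lr)   # (hid_size, in_size) low-rank update

main_w = torch.randn(hid_size, in_size)     # weight of the first nn.Linear (hid x in)
adapted_w = main_w + 1.0 * delta_w          # scale defaults to 1.0
assert adapted_w.shape == (hid_size, in_size)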