def __init__(self, input_size, vocab_size, init_adadelta, ctc_weight,
             encoder, attention, decoder, emb_drop=0.0):
    super(ASR, self).__init__()

    # Setup
    assert 0 <= ctc_weight <= 1
    self.vocab_size = vocab_size
    self.ctc_weight = ctc_weight
    self.enable_ctc = ctc_weight > 0
    self.enable_att = ctc_weight != 1
    self.lm = None

    # Modules
    self.encoder = Encoder(input_size, **encoder)  # NOTE: encoder built here
    if self.enable_ctc:
        # CTC head: projects encoder outputs to per-frame vocabulary logits
        self.ctc_layer = nn.Linear(self.encoder.out_dim, vocab_size)
    if self.enable_att:
        self.dec_dim = decoder['dim']
        self.pre_embed = nn.Embedding(vocab_size, self.dec_dim)
        self.embed_drop = nn.Dropout(emb_drop)
        self.decoder = Decoder(
            self.encoder.out_dim + self.dec_dim, vocab_size, **decoder)  # NOTE: decoder built here
        query_dim = self.dec_dim * self.decoder.layer
        self.attention = Attention(
            self.encoder.out_dim, query_dim, **attention)

    # Init
    if init_adadelta:
        self.apply(init_weights)
        if self.enable_att:
            # The decoder RNN is a stack of N LSTM/GRU layers, so each
            # layer's input-hidden gate bias is initialized separately.
            for l in range(self.decoder.layer):
                bias = getattr(self.decoder.layers, 'bias_ih_l{}'.format(l))
                bias = init_gate(bias)
def __init__(self, input_size, vocab_size, init_adadelta, ctc_weight,
             encoder, attention, decoder, emb_drop=0.0):
    super(ASR, self).__init__()

    # Setup
    assert 0 <= ctc_weight <= 1
    self.vocab_size = vocab_size
    self.ctc_weight = ctc_weight
    self.enable_ctc = ctc_weight > 0
    self.enable_att = ctc_weight != 1
    self.lm = None

    # Modules
    self.encoder = Encoder(input_size, **encoder)
    if self.enable_ctc:
        self.ctc_layer = nn.Linear(self.encoder.out_dim, vocab_size)
    if self.enable_att:
        self.dec_dim = decoder['dim']
        self.pre_embed = nn.Embedding(vocab_size, self.dec_dim)
        self.embed_drop = nn.Dropout(emb_drop)
        self.decoder = Decoder(
            self.encoder.out_dim + self.dec_dim, vocab_size, **decoder)
        if self.decoder.decoder_type == 'rnn':
            query_dim = self.dec_dim * self.decoder.layer
            self.attention = Attention(
                self.encoder.out_dim, query_dim, **attention)
        else:
            self.attention = None

    # Init
    if init_adadelta and self.decoder.decoder_type == 'rnn':
        self.apply(init_weights)
        for l in range(self.decoder.layer):
            bias = getattr(self.decoder.layers, 'bias_ih_l{}'.format(l))
            bias = init_gate(bias)
def __init__(self, input_size, vocab_size, init_adadelta, ctc_weight,
             encoder, attention, decoder, freeze_weights, emb_drop=0.0):
    super(ASR, self).__init__()

    # Setup
    assert 0 <= ctc_weight <= 1
    self.vocab_size = vocab_size
    self.ctc_weight = ctc_weight
    self.enable_ctc = ctc_weight > 0
    self.enable_att = ctc_weight != 1
    self.lm = None
    self.freeze_weights = freeze_weights

    # Modules
    self.encoder = Encoder(input_size, **encoder)
    print("Encoder model:\n", self.encoder)
    if self.enable_ctc:
        self.ctc_layer = nn.Linear(self.encoder.out_dim, vocab_size)
        print("CTC Model:\n", self.ctc_layer)
    if self.enable_att:
        self.dec_dim = decoder['dim']
        self.pre_embed = nn.Embedding(vocab_size, self.dec_dim)
        self.embed_drop = nn.Dropout(emb_drop)
        self.decoder = Decoder(
            self.encoder.out_dim + self.dec_dim, vocab_size, **decoder)
        print("Decoder model:\n", self.decoder)
        query_dim = self.dec_dim * self.decoder.layer
        self.attention = Attention(
            self.encoder.out_dim, query_dim, **attention)
        print("Attention model:\n", self.attention)

    # Optionally freeze selected sub-modules
    if len(self.freeze_weights) > 0:
        for param in self.freeze_weights:
            if param == "embed":
                for s_param in self.pre_embed.parameters():
                    s_param.requires_grad = False
                print("Embedding layers frozen")
            if param == "encoder":
                for s_param in self.encoder.parameters():
                    s_param.requires_grad = False
                print("Encoder layers frozen")
            if param == "decoder":
                for s_param in self.decoder.parameters():
                    s_param.requires_grad = False
                print("Decoder layers frozen")

    # Init
    if init_adadelta:
        self.apply(init_weights)
        if self.enable_att:
            for l in range(self.decoder.layer):
                bias = getattr(self.decoder.layers, 'bias_ih_l{}'.format(l))
                bias = init_gate(bias)
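# A minimal sketch (an assumption, not taken from the source) of what the
# freeze_weights branch above achieves in training: parameters with
# requires_grad = False receive no gradient and are typically also filtered
# out of the optimizer. The tiny model and the Adadelta optimizer below are
# illustrative placeholders, not the project's actual training loop.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 2))
for p in model[0].parameters():       # "freeze" the first layer
    p.requires_grad = False
trainable = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adadelta(trainable, lr=1.0)

loss = model(torch.randn(4, 8)).sum()
loss.backward()
assert model[0].weight.grad is None   # frozen layer gets no gradient
optimizer.step()                      # only the unfrozen layer is updated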
def __init__(self, input_size, vocab_size, batch_size, ctc_weight,
             encoder, attention, decoder, emb_drop=0.0, init_adadelta=True):
    super(ASR, self).__init__()

    # Setup
    assert 0 <= ctc_weight <= 1
    self.vocab_size = vocab_size
    self.ctc_weight = ctc_weight
    self.enable_ctc = ctc_weight > 0
    self.enable_att = ctc_weight != 1
    self.lm = None

    # Modules
    self.encoder = Encoder(input_size, batch_size, **encoder)
    if self.enable_ctc:
        self.ctc_layer = nn.Sequential(
            nn.Linear(self.encoder.out_dim, vocab_size),
            nn.ReLU())
    if self.enable_att:
        self.dec_dim = decoder['dim']
        self.pre_embed = nn.Embedding(vocab_size, self.dec_dim)
        self.embed_drop = nn.Dropout(emb_drop)
        self.decoder = Decoder(batch_size,
                               self.encoder.out_dim + self.dec_dim,
                               vocab_size, **decoder)
        query_dim = self.dec_dim * self.decoder.layer
        self.attention = Attention(self.encoder.out_dim, query_dim, **attention)

    # Init
    # Special initialization does not work, so Adadelta-style init is forced on.
    init_adadelta = True
    if init_adadelta:
        self.apply(init_weights)
        if self.enable_att:
            for l in range(self.decoder.layer):
                bias = getattr(self.decoder.layers, 'bias_ih_l{}'.format(l))
                bias = init_gate(bias)
    # Orthogonal weight initialisation
def __init__(self,
             input_size,
             vocab_size,   # analogous to the character vocabulary (voc) in OCR
             init_adadelta,
             ctc_weight,
             encoder,
             attention,
             decoder,
             emb_drop=0.0):
    super(ASR, self).__init__()

    # Setup
    assert 0 <= ctc_weight <= 1
    self.vocab_size = vocab_size        # size of the recognition vocabulary
    self.ctc_weight = ctc_weight        # weight of the CTC branch
    self.enable_ctc = ctc_weight > 0    # bool
    self.enable_att = ctc_weight != 1   # bool
    self.lm = None

    # Modules
    self.encoder = Encoder(input_size, **encoder)
    if self.enable_ctc:  # CTC branch enabled
        self.ctc_layer = nn.Linear(self.encoder.out_dim, vocab_size)
    if self.enable_att:  # attention branch enabled
        self.dec_dim = decoder['dim']
        self.pre_embed = nn.Embedding(vocab_size, self.dec_dim)
        self.embed_drop = nn.Dropout(emb_drop)
        self.decoder = Decoder(self.encoder.out_dim + self.dec_dim,
                               vocab_size, **decoder)
        query_dim = self.dec_dim * self.decoder.layer
        self.attention = Attention(self.encoder.out_dim, query_dim, **attention)

    # Init
    if init_adadelta:
        self.apply(init_weights)
        if self.enable_att:
            for l in range(self.decoder.layer):
                bias = getattr(self.decoder.layers, 'bias_ih_l{}'.format(l))
                bias = init_gate(bias)
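# A minimal, self-contained sketch (an assumption, not taken from the source)
# of how a ctc_weight like the one configured above is typically used in
# hybrid CTC/attention training: loss = w * ctc_loss + (1 - w) * att_loss.
# The tensor shapes, dummy logits, and blank index are illustrative
# placeholders, not outputs of the actual Encoder/Decoder modules.
import torch
import torch.nn as nn

ctc_weight = 0.5
T, B, V, S = 50, 4, 46, 12             # frames, batch, vocab size, target length
enc_logits = torch.randn(T, B, V)      # per-frame logits from a CTC head
dec_logits = torch.randn(B, S, V)      # per-step logits from an attention decoder
targets = torch.randint(1, V, (B, S))  # dummy label sequences (index 0 = CTC blank)

ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)(
    enc_logits.log_softmax(dim=-1),
    targets,
    input_lengths=torch.full((B,), T, dtype=torch.long),
    target_lengths=torch.full((B,), S, dtype=torch.long))
att_loss = nn.CrossEntropyLoss()(dec_logits.reshape(-1, V), targets.reshape(-1))
loss = ctc_weight * ctc_loss + (1 - ctc_weight) * att_loss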