Example no. 1
    def __init__(self, input_size, vocab_size, init_adadelta, ctc_weight, encoder, attention, decoder, emb_drop=0.0):
        super(ASR, self).__init__()

        # Setup
        assert 0 <= ctc_weight <= 1
        self.vocab_size = vocab_size
        self.ctc_weight = ctc_weight
        self.enable_ctc = ctc_weight > 0
        self.enable_att = ctc_weight != 1
        self.lm = None

        # Modules
        self.encoder = Encoder(input_size, **encoder) ## NOTE: Encoder Here
        
        if self.enable_ctc:
            # Projects encoder features to vocab-sized logits for the CTC branch.
            self.ctc_layer = nn.Linear(self.encoder.out_dim, vocab_size)
        if self.enable_att: 
            self.dec_dim    = decoder['dim']
            self.pre_embed  = nn.Embedding(vocab_size, self.dec_dim)
            self.embed_drop = nn.Dropout(emb_drop)
            self.decoder    = Decoder(
                self.encoder.out_dim+self.dec_dim, vocab_size, **decoder) ## NOTE: Decoder Here
            query_dim = self.dec_dim*self.decoder.layer
            self.attention = Attention(
                self.encoder.out_dim, query_dim, **attention)

        # Init
        if init_adadelta:
            self.apply(init_weights)
            if self.enable_att:
                for l in range(self.decoder.layer):
                    # The decoder's LSTM/GRU is a stack of N layers, so the
                    # input-hidden gate bias of each layer is initialised separately.
                    bias = getattr(self.decoder.layers, 'bias_ih_l{}'.format(l))
                    bias = init_gate(bias)
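init_gate is defined elsewhere in the repository; a common implementation sets the forget-gate slice of an LSTM's bias_ih_l{l} to 1 so the memory cell retains information early in training. A minimal sketch under that assumption (the slice indices rely on PyTorch's input/forget/cell/output gate ordering and are not necessarily the repo's exact code):

import torch.nn as nn

def init_gate(bias):
    # Assumed behaviour: set the forget-gate bias to 1.
    # PyTorch packs LSTM gate biases as [input | forget | cell | output],
    # so the forget gate occupies the second quarter of the vector.
    n = bias.size(0)
    bias.data[n // 4: n // 2].fill_(1.0)
    return bias

# Usage on a stand-in 2-layer LSTM, mirroring the loop above.
lstm = nn.LSTM(input_size=8, hidden_size=16, num_layers=2)
for l in range(lstm.num_layers):
    init_gate(getattr(lstm, 'bias_ih_l{}'.format(l)))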
Example no. 2
    def __init__(self, input_size, vocab_size, init_adadelta, ctc_weight, encoder, attention, decoder, emb_drop=0.0):
        super(ASR, self).__init__()

        # Setup
        assert 0 <= ctc_weight <= 1
        self.vocab_size = vocab_size
        self.ctc_weight = ctc_weight
        self.enable_ctc = ctc_weight > 0
        self.enable_att = ctc_weight != 1
        self.lm = None

        # Modules
        self.encoder = Encoder(input_size, **encoder)
        if self.enable_ctc:
            self.ctc_layer = nn.Linear(self.encoder.out_dim, vocab_size)
        if self.enable_att:
            self.dec_dim = decoder['dim']
            self.pre_embed = nn.Embedding(vocab_size, self.dec_dim)
            self.embed_drop = nn.Dropout(emb_drop)
            self.decoder = Decoder(
                self.encoder.out_dim + self.dec_dim, vocab_size, **decoder)
            if self.decoder.decoder_type == 'rnn':
                query_dim = self.dec_dim*self.decoder.layer
                self.attention = Attention(
                    self.encoder.out_dim, query_dim, **attention)
            else:
                self.attention = None

        # Init
        # self.decoder only exists when attention is enabled, so guard on enable_att first.
        if init_adadelta and self.enable_att and self.decoder.decoder_type == 'rnn':
            self.apply(init_weights)
            for l in range(self.decoder.layer):
                bias = getattr(self.decoder.layers, 'bias_ih_l{}'.format(l))
                bias = init_gate(bias)
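The query_dim = self.dec_dim * self.decoder.layer arithmetic in the RNN branch suggests the attention query is the concatenation of every decoder layer's hidden state. A purely illustrative dimension check under that assumption (the Decoder internals are not shown here):

import torch

# Illustrative only: assume the attention query concatenates all decoder-layer
# hidden states, which is what query_dim = dec_dim * num_layers implies.
dec_dim, num_layers, batch = 320, 2, 4
layer_hiddens = [torch.zeros(batch, dec_dim) for _ in range(num_layers)]
query = torch.cat(layer_hiddens, dim=-1)
assert query.shape == (batch, dec_dim * num_layers)   # matches query_dim above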
Example no. 3
    def __init__(self, input_size, vocab_size, init_adadelta, ctc_weight, encoder, attention, decoder, freeze_weights, emb_drop=0.0):
        super(ASR, self).__init__()

        # Setup
        assert 0 <= ctc_weight <= 1
        self.vocab_size = vocab_size
        self.ctc_weight = ctc_weight
        self.enable_ctc = ctc_weight > 0
        self.enable_att = ctc_weight != 1
        self.lm = None
        self.freeze_weights = freeze_weights
        # Modules
        self.encoder = Encoder(input_size, **encoder)
        print("Encoder model:\n", self.encoder)
        if self.enable_ctc:
            self.ctc_layer = nn.Linear(self.encoder.out_dim, vocab_size)
            print("CTC Model:\n", self.ctc_layer)
        if self.enable_att:
            self.dec_dim = decoder['dim']
            self.pre_embed = nn.Embedding(vocab_size, self.dec_dim)
            self.embed_drop = nn.Dropout(emb_drop)
            self.decoder = Decoder(
                self.encoder.out_dim+self.dec_dim, vocab_size, **decoder)
            print("Decoder model:\n", self.decoder)
            query_dim = self.dec_dim*self.decoder.layer
            self.attention = Attention(
                self.encoder.out_dim, query_dim, **attention)
            print("Attention model:\n", self.attention)

        if len(self.freeze_weights) > 0:
            for param in self.freeze_weights:
                if param == "embed":
                    for s_param in self.pre_embed.parameters():
                        s_param.requires_grad = False
                    print("Embedding layers frozen")
                if param == "encoder":
                    for s_param in self.encoder.parameters():
                        s_param.requires_grad = False
                    print("Encoder layers frozen")
                if param == "decoder":
                    for s_param in self.decoder.parameters():
                        s_param.requires_grad = False
                    print("Decoder layers frozen")

        # Init
        if init_adadelta:
            self.apply(init_weights)
            if self.enable_att:
                for l in range(self.decoder.layer):
                    bias = getattr(self.decoder.layers, 'bias_ih_l{}'.format(l))
                    bias = init_gate(bias)
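The freeze_weights handling in this variant simply flips requires_grad on the chosen sub-modules. A self-contained sketch of the same pattern on a toy module (not the ASR class itself), showing that only the remaining parameters need to be handed to the optimiser:

import torch.nn as nn
import torch.optim as optim

# Toy stand-in: treat toy[0] as the "encoder" to be frozen.
toy = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 4))
for p in toy[0].parameters():
    p.requires_grad = False          # same effect as the freezing loops above

# Frozen weights are skipped by passing only trainable parameters to the optimiser.
trainable = [p for p in toy.parameters() if p.requires_grad]
optimizer = optim.Adadelta(trainable, lr=1.0)
print(sum(p.numel() for p in trainable), "trainable parameters")  # 8*4 + 4 = 36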
Example no. 4
    def __init__(self,
                 input_size,
                 vocab_size,
                 batch_size,
                 ctc_weight,
                 encoder,
                 attention,
                 decoder,
                 emb_drop=0.0,
                 init_adadelta=True):
        super(ASR, self).__init__()

        # Setup
        assert 0 <= ctc_weight <= 1
        self.vocab_size = vocab_size
        self.ctc_weight = ctc_weight
        self.enable_ctc = ctc_weight > 0
        self.enable_att = ctc_weight != 1
        self.lm = None

        # Modules
        self.encoder = Encoder(input_size, batch_size, **encoder)
        if self.enable_ctc:
            self.ctc_layer = nn.Sequential(
                nn.Linear(self.encoder.out_dim, vocab_size), nn.ReLU())
        if self.enable_att:
            self.dec_dim = decoder['dim']
            self.pre_embed = nn.Embedding(vocab_size, self.dec_dim)
            self.embed_drop = nn.Dropout(emb_drop)
            self.decoder = Decoder(batch_size,
                                   self.encoder.out_dim + self.dec_dim,
                                   vocab_size, **decoder)
            query_dim = self.dec_dim * self.decoder.layer
            self.attention = Attention(self.encoder.out_dim, query_dim,
                                       **attention)

        # Init
        """special initialization not work"""
        init_adadelta = True
        if init_adadelta:
            self.apply(init_weights)
            if self.enable_att:
                for l in range(self.decoder.layer):
                    bias = getattr(self.decoder.layers,
                                   'bias_ih_l{}'.format(l))
                    bias = init_gate(bias)

        # Orthogonal weight initialisation
        '''
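Example no. 4 wraps the CTC head in nn.Sequential(Linear, ReLU); note that a ReLU before the softmax clips negative logits, whereas the other examples keep a plain Linear. Either way, the head is typically consumed by turning its output into log-probabilities for nn.CTCLoss, as in this stand-alone sketch with toy shapes (not the repo's training loop):

import torch
import torch.nn as nn

# Toy shapes only: T time steps, N batch, C = vocab_size classes.
T, N, enc_dim, vocab_size = 50, 4, 32, 30
enc_out = torch.randn(T, N, enc_dim)                 # stand-in for encoder output
ctc_layer = nn.Linear(enc_dim, vocab_size)           # plain-Linear head as in examples 1-3

log_probs = ctc_layer(enc_out).log_softmax(dim=-1)   # nn.CTCLoss expects log-probs (T, N, C)
targets = torch.randint(1, vocab_size, (N, 10))      # index 0 assumed to be the blank
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 10, dtype=torch.long)

loss = nn.CTCLoss(blank=0)(log_probs, targets, input_lengths, target_lengths)
print(loss.item())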
Example no. 5
    def __init__(
            self,
            input_size,
            vocab_size,  # analogous to the character-set vocabulary in OCR
            init_adadelta,
            ctc_weight,
            encoder,
            attention,
            decoder,
            emb_drop=0.0):
        super(ASR, self).__init__()

        # Setup
        assert 0 <= ctc_weight <= 1
        self.vocab_size = vocab_size  # size of the recognition vocabulary
        self.ctc_weight = ctc_weight  # weight of the CTC loss term
        self.enable_ctc = ctc_weight > 0  # bool
        self.enable_att = ctc_weight != 1  # bool
        self.lm = None

        # Modules
        self.encoder = Encoder(input_size, **encoder)
        if self.enable_ctc:  # CTC branch enabled
            self.ctc_layer = nn.Linear(self.encoder.out_dim, vocab_size)
        if self.enable_att:  # attention branch enabled
            self.dec_dim = decoder['dim']
            self.pre_embed = nn.Embedding(vocab_size, self.dec_dim)
            self.embed_drop = nn.Dropout(emb_drop)
            self.decoder = Decoder(self.encoder.out_dim + self.dec_dim,
                                   vocab_size, **decoder)
            query_dim = self.dec_dim * self.decoder.layer
            self.attention = Attention(self.encoder.out_dim, query_dim,
                                       **attention)

        # Init
        if init_adadelta:
            self.apply(init_weights)
            if self.enable_att:
                for l in range(self.decoder.layer):
                    bias = getattr(self.decoder.layers,
                                   'bias_ih_l{}'.format(l))
                    bias = init_gate(bias)
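Every variant finishes with self.apply(init_weights), where init_weights comes from elsewhere in the repository. nn.Module.apply visits each sub-module in turn; the sketch below illustrates the pattern with an assumed fan-in-scaled normal initialisation for Linear layers, which may differ from the repo's actual init_weights:

import math
import torch.nn as nn

def init_weights(module):
    # Assumed example init: fan-in-scaled normal weights, zero biases for Linear layers.
    if isinstance(module, nn.Linear):
        stdv = 1.0 / math.sqrt(module.weight.size(1))
        module.weight.data.normal_(0, stdv)
        if module.bias is not None:
            module.bias.data.zero_()

# self.apply(init_weights) in the examples walks the whole ASR module tree like this:
model = nn.Sequential(nn.Linear(16, 8), nn.ReLU(), nn.Linear(8, 4))
model.apply(init_weights)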