def __init__(self, encoder_num_units, decoder_num_units, attention_type,
             attention_dim, use_cuda, sharpening_factor=1,
             sigmoid_smoothing=False, out_channels=10, kernel_size=201,
             num_heads=1):
    super(AttentionMechanism, self).__init__()

    self.attention_type = attention_type
    self.attention_dim = attention_dim
    self.use_cuda = use_cuda
    self.sharpening_factor = sharpening_factor
    self.sigmoid_smoothing = sigmoid_smoothing
    self.num_heads = num_heads

    # Multi-head attention
    if num_heads > 1:
        setattr(self, 'W_mha',
                LinearND(encoder_num_units * num_heads, encoder_num_units,
                         use_cuda=use_cuda))

    with self.init_scope():
        for h in range(num_heads):
            if self.attention_type == 'content':
                setattr(self, 'W_enc_head' + str(h),
                        LinearND(encoder_num_units, attention_dim,
                                 bias=True, use_cuda=use_cuda))
                setattr(self, 'W_dec_head' + str(h),
                        LinearND(decoder_num_units, attention_dim,
                                 bias=False, use_cuda=use_cuda))
                setattr(self, 'V_head' + str(h),
                        LinearND(attention_dim, 1,
                                 bias=False, use_cuda=use_cuda))

            elif self.attention_type == 'location':
                assert kernel_size % 2 == 1
                setattr(self, 'W_enc_head' + str(h),
                        LinearND(encoder_num_units, attention_dim,
                                 bias=True, use_cuda=use_cuda))
                setattr(self, 'W_dec_head' + str(h),
                        LinearND(decoder_num_units, attention_dim,
                                 bias=False, use_cuda=use_cuda))
                setattr(self, 'W_conv_head' + str(h),
                        LinearND(out_channels, attention_dim,
                                 bias=False, use_cuda=use_cuda))
                # setattr(self, 'conv_head' + str(h),
                #         L.ConvolutionND(ndim=1,
                #                         in_channels=1,
                #                         out_channels=out_channels,
                #                         ksize=kernel_size,
                #                         stride=1,
                #                         pad=kernel_size // 2,
                #                         nobias=True,
                #                         initialW=None,
                #                         initial_bias=None))
                setattr(self, 'conv_head' + str(h),
                        L.Convolution2D(in_channels=1,
                                        out_channels=out_channels,
                                        ksize=(1, kernel_size),
                                        stride=1,
                                        pad=(0, kernel_size // 2),
                                        nobias=True,
                                        initialW=None,
                                        initial_bias=None))
                setattr(self, 'V_head' + str(h),
                        LinearND(attention_dim, 1,
                                 bias=False, use_cuda=use_cuda))

            elif self.attention_type == 'dot_product':
                setattr(self, 'W_enc_head' + str(h),
                        LinearND(encoder_num_units, decoder_num_units,
                                 bias=False, use_cuda=use_cuda))

            elif self.attention_type == 'rnn_attention':
                raise NotImplementedError

            elif self.attention_type == 'coverage':
                raise NotImplementedError

            else:
                raise TypeError(
                    "attention_type should be one of [%s], you provided %s." %
                    (", ".join(ATTENTION_TYPE), attention_type))

    if use_cuda:
        for c in self.children():
            c.to_gpu()
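# --- Usage sketch (illustrative only) ---
# A minimal, hypothetical construction of the location-aware variant; the
# argument values are examples and LinearND / chainer.links (L) are assumed
# to be importable from this package, as in the code above.
#
#   attend = AttentionMechanism(encoder_num_units=320,   # e.g. 160 units x 2 directions
#                               decoder_num_units=320,
#                               attention_type='location',
#                               attention_dim=128,
#                               use_cuda=False,
#                               sharpening_factor=1,
#                               sigmoid_smoothing=False,
#                               out_channels=10,    # conv channels for location features
#                               kernel_size=201,    # must be odd (see the assert above)
#                               num_heads=1)
#
# With num_heads > 1, the extra projection W_mha maps the concatenated
# per-head context vectors back to encoder_num_units.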
def __init__(self, input_size, rnn_type, bidirectional, num_units, num_proj,
             num_layers, dropout_input, dropout_hidden, subsample_list=[],
             subsample_type='drop', use_cuda=False, merge_bidirectional=False,
             num_stack=1, splice=1, input_channel=1, conv_channels=[],
             conv_kernel_sizes=[], conv_strides=[], poolings=[],
             activation='relu', batch_norm=False, residual=False,
             dense_residual=False, num_layers_sub=0):
    super(RNNEncoder, self).__init__()

    if len(subsample_list) > 0 and len(subsample_list) != num_layers:
        raise ValueError(
            'subsample_list must be the same size as num_layers.')
    if subsample_type not in ['drop', 'concat']:
        raise TypeError('subsample_type must be "drop" or "concat".')
    if num_layers_sub < 0 or (num_layers_sub > 1 and num_layers < num_layers_sub):
        raise ValueError('Set num_layers_sub between 1 and num_layers.')

    self.rnn_type = rnn_type
    self.bidirectional = bidirectional
    self.num_directions = 2 if bidirectional else 1
    self.num_units = num_units
    self.num_proj = num_proj if num_proj is not None else 0
    self.num_layers = num_layers
    self.dropout_input = dropout_input
    self.dropout_hidden = dropout_hidden
    self.merge_bidirectional = merge_bidirectional
    self.use_cuda = use_cuda
    # TODO: self.clip_activation = clip_activation

    # Setting for hierarchical encoder
    self.num_layers_sub = num_layers_sub

    # Setting for subsampling
    if len(subsample_list) == 0:
        self.subsample_list = [False] * num_layers
    else:
        self.subsample_list = subsample_list
    self.subsample_type = subsample_type
    # This implementation is based on
    #   https://arxiv.org/abs/1508.01211
    #   Chan, William, et al. "Listen, attend and spell."
    #   arXiv preprint arXiv:1508.01211 (2015).

    # Setting for residual connection
    assert not (residual and dense_residual)
    self.residual = residual
    self.dense_residual = dense_residual
    subsample_last_layer = 0
    for l_reverse, is_subsample in enumerate(subsample_list[::-1]):
        if is_subsample:
            subsample_last_layer = num_layers - l_reverse
            break
    self.residual_start_layer = subsample_last_layer + 1
    # NOTE: residual connection starts from the last subsampling layer

    with self.init_scope():
        # Setting for CNNs before RNNs
        if len(conv_channels) > 0 and len(conv_channels) == len(conv_kernel_sizes) and len(conv_kernel_sizes) == len(conv_strides):
            assert num_stack == 1 and splice == 1
            self.conv = CNNEncoder(input_size,
                                   input_channel=input_channel,
                                   conv_channels=conv_channels,
                                   conv_kernel_sizes=conv_kernel_sizes,
                                   conv_strides=conv_strides,
                                   poolings=poolings,
                                   dropout_input=0,
                                   dropout_hidden=dropout_hidden,
                                   activation=activation,
                                   use_cuda=use_cuda,
                                   batch_norm=batch_norm)
            input_size = self.conv.output_size
        else:
            input_size = input_size * splice * num_stack
            self.conv = None

        self.rnns = []
        self.projections = []
        for l in range(num_layers):
            if l == 0:
                encoder_input_size = input_size
            elif self.num_proj > 0:
                encoder_input_size = num_proj
                if subsample_type == 'concat' and l > 0 and self.subsample_list[l - 1]:
                    encoder_input_size *= 2
            else:
                encoder_input_size = num_units * self.num_directions
                if subsample_type == 'concat' and l > 0 and self.subsample_list[l - 1]:
                    encoder_input_size *= 2

            if rnn_type == 'lstm':
                if bidirectional:
                    rnn_i = L.NStepBiLSTM(n_layers=1,
                                          in_size=encoder_input_size,
                                          out_size=num_units,
                                          dropout=0)
                else:
                    rnn_i = L.NStepLSTM(n_layers=1,
                                        in_size=encoder_input_size,
                                        out_size=num_units,
                                        dropout=0)
            elif rnn_type == 'gru':
                if bidirectional:
                    rnn_i = L.NStepBiGRU(n_layers=1,
                                         in_size=encoder_input_size,
                                         out_size=num_units,
                                         dropout=0)
                else:
                    rnn_i = L.NStepGRU(n_layers=1,
                                       in_size=encoder_input_size,
                                       out_size=num_units,
                                       dropout=0)
            elif rnn_type == 'rnn':
                if bidirectional:
                    # rnn_i = L.NStepBiRNNReLU(
                    rnn_i = L.NStepBiRNNTanh(n_layers=1,
                                             in_size=encoder_input_size,
                                             out_size=num_units,
                                             dropout=0)
                else:
                    # rnn_i = L.NStepRNNReLU(
                    rnn_i = L.NStepRNNTanh(n_layers=1,
                                           in_size=encoder_input_size,
                                           out_size=num_units,
                                           dropout=0)
            else:
                raise ValueError(
                    'rnn_type must be "lstm" or "gru" or "rnn".')

            if use_cuda:
                rnn_i.to_gpu()
            setattr(self, rnn_type + '_l' + str(l), rnn_i)

            if l != self.num_layers - 1 and self.num_proj > 0:
                proj_i = LinearND(num_units * self.num_directions, num_proj,
                                  dropout=dropout_hidden, use_cuda=use_cuda)
                if use_cuda:
                    proj_i.to_gpu()
                setattr(self, 'proj_l' + str(l), proj_i)
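# --- Usage sketch (illustrative only) ---
# Hypothetical construction of a 4-layer bidirectional LSTM encoder that
# drops frames after the 2nd and 3rd layers; the values are examples only.
#
#   encoder = RNNEncoder(input_size=120,
#                        rnn_type='lstm',
#                        bidirectional=True,
#                        num_units=320,
#                        num_proj=0,
#                        num_layers=4,
#                        dropout_input=0.2,
#                        dropout_hidden=0.2,
#                        subsample_list=[False, True, True, False],
#                        subsample_type='drop',
#                        use_cuda=False)
#
# When conv_channels is non-empty (and matches conv_kernel_sizes and
# conv_strides in length), a CNNEncoder front-end is inserted before the
# RNN layers and input_size is replaced by self.conv.output_size.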
def __init__(self, input_size, encoder_type, encoder_bidirectional,
             encoder_num_units, encoder_num_proj, encoder_num_layers,
             fc_list, dropout_input, dropout_encoder, num_classes,
             parameter_init_distribution='uniform', parameter_init=0.1,
             recurrent_weight_orthogonal=False,
             init_forget_gate_bias_with_one=True, subsample_list=[],
             subsample_type='drop', logits_temperature=1, num_stack=1,
             splice=1, input_channel=1, conv_channels=[],
             conv_kernel_sizes=[], conv_strides=[], poolings=[],
             activation='relu', batch_norm=False, label_smoothing_prob=0,
             weight_noise_std=0, encoder_residual=False,
             encoder_dense_residual=False):
    super(ModelBase, self).__init__()
    self.model_type = 'ctc'

    # Setting for the encoder
    self.input_size = input_size
    self.num_stack = num_stack
    self.encoder_type = encoder_type
    self.encoder_num_units = encoder_num_units
    if encoder_bidirectional:
        self.encoder_num_units *= 2
    self.fc_list = fc_list
    self.subsample_list = subsample_list
    self.batch_norm = batch_norm

    # Setting for CTC
    self.num_classes = num_classes + 1  # Add the blank class
    self.logits_temperature = logits_temperature

    # Setting for regularization
    self.weight_noise_injection = False
    self.weight_noise_std = float(weight_noise_std)
    self.ls_prob = label_smoothing_prob

    with self.init_scope():
        # Load the encoder
        if encoder_type in ['lstm', 'gru', 'rnn']:
            self.encoder = load(encoder_type=encoder_type)(
                input_size=input_size,
                rnn_type=encoder_type,
                bidirectional=encoder_bidirectional,
                num_units=encoder_num_units,
                num_proj=encoder_num_proj,
                num_layers=encoder_num_layers,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                subsample_list=subsample_list,
                subsample_type=subsample_type,
                use_cuda=self.use_cuda,
                merge_bidirectional=False,
                num_stack=num_stack,
                splice=splice,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                activation=activation,
                batch_norm=batch_norm,
                residual=encoder_residual,
                dense_residual=encoder_dense_residual)
        elif encoder_type == 'cnn':
            assert num_stack == 1 and splice == 1
            self.encoder = load(encoder_type='cnn')(
                input_size=input_size,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                use_cuda=self.use_cuda,
                activation=activation,
                batch_norm=batch_norm)
        else:
            raise NotImplementedError

        ##################################################
        # Fully-connected layers
        ##################################################
        if len(fc_list) > 0:
            for i in range(len(fc_list)):
                if i == 0:
                    if encoder_type == 'cnn':
                        bottle_input_size = self.encoder.output_size
                    else:
                        bottle_input_size = self.encoder_num_units
                    # if batch_norm:
                    #     setattr(self, 'bn_fc_0',
                    #             L.BatchNormalization(bottle_input_size))
                    setattr(self, 'fc_0',
                            LinearND(bottle_input_size, fc_list[i],
                                     dropout=dropout_encoder,
                                     use_cuda=self.use_cuda))
                else:
                    # if batch_norm:
                    #     setattr(self, 'bn_fc_' + str(i),
                    #             L.BatchNormalization(fc_list[i - 1]))
                    setattr(self, 'fc_' + str(i),
                            LinearND(fc_list[i - 1], fc_list[i],
                                     dropout=dropout_encoder,
                                     use_cuda=self.use_cuda))
            # TODO: remove a bias term in the case of batch normalization
            self.fc_out = LinearND(fc_list[-1], self.num_classes,
                                   use_cuda=self.use_cuda)
        else:
            self.fc_out = LinearND(self.encoder_num_units, self.num_classes,
                                   use_cuda=self.use_cuda)

    ##################################################
    # Initialize parameters
    ##################################################
    self.init_weights(parameter_init,
                      distribution=parameter_init_distribution,
                      ignore_keys=['bias'])

    # Initialize all biases with 0
    self.init_weights(0, distribution='constant', keys=['bias'])

    # Recurrent weights are orthogonalized
    if recurrent_weight_orthogonal and encoder_type != 'cnn':
        self.init_weights(parameter_init,
                          distribution='orthogonal',
                          keys=[encoder_type, 'weight'],
                          ignore_keys=['bias'])

    # Initialize bias in forget gate with 1
    if init_forget_gate_bias_with_one:
        self.init_forget_gate_bias_with_one()

    # Set CTC decoders
    self._decode_greedy_np = GreedyDecoder(blank_index=0)
    self._decode_beam_np = BeamSearchDecoder(blank_index=0)
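# --- Usage sketch (illustrative only) ---
# Hypothetical instantiation of the CTC model defined above, assuming the
# enclosing class is named CTC (the constructor sets model_type = 'ctc');
# helper names (load, LinearND, GreedyDecoder, BeamSearchDecoder) come from
# this package and the numbers are examples only.
#
#   model = CTC(input_size=120,
#               encoder_type='lstm',
#               encoder_bidirectional=True,
#               encoder_num_units=320,
#               encoder_num_proj=0,
#               encoder_num_layers=4,
#               fc_list=[],
#               dropout_input=0.2,
#               dropout_encoder=0.2,
#               num_classes=28)   # the blank class is added internally (+1)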
def __init__(self,
             input_size,
             encoder_type,
             encoder_bidirectional,
             encoder_num_units,
             encoder_num_proj,
             encoder_num_layers,
             encoder_num_layers_sub,  # ***
             fc_list,
             fc_list_sub,
             dropout_input,
             dropout_encoder,
             main_loss_weight,  # ***
             sub_loss_weight,  # ***
             num_classes,
             num_classes_sub,  # ***
             parameter_init_distribution='uniform',
             parameter_init=0.1,
             recurrent_weight_orthogonal=False,
             init_forget_gate_bias_with_one=True,
             subsample_list=[],
             subsample_type='drop',
             logits_temperature=1,
             num_stack=1,
             splice=1,
             input_channel=1,
             conv_channels=[],
             conv_kernel_sizes=[],
             conv_strides=[],
             poolings=[],
             activation='relu',
             batch_norm=False,
             label_smoothing_prob=0,
             weight_noise_std=0,
             encoder_residual=False,
             encoder_dense_residual=False):
    super(HierarchicalCTC, self).__init__(
        input_size=input_size,
        encoder_type=encoder_type,
        encoder_bidirectional=encoder_bidirectional,
        encoder_num_units=encoder_num_units,
        encoder_num_proj=encoder_num_proj,
        encoder_num_layers=encoder_num_layers,
        dropout_input=dropout_input,
        dropout_encoder=dropout_encoder,
        num_classes=num_classes,
        parameter_init=parameter_init,
        subsample_list=subsample_list,
        subsample_type=subsample_type,
        fc_list=fc_list,
        num_stack=num_stack,
        splice=splice,
        input_channel=input_channel,
        conv_channels=conv_channels,
        conv_kernel_sizes=conv_kernel_sizes,
        conv_strides=conv_strides,
        poolings=poolings,
        logits_temperature=logits_temperature,
        batch_norm=batch_norm,
        label_smoothing_prob=label_smoothing_prob,
        weight_noise_std=weight_noise_std)
    self.model_type = 'hierarchical_ctc'

    # Setting for the encoder
    self.encoder_num_layers_sub = encoder_num_layers_sub
    self.fc_list_sub = fc_list_sub

    # Setting for CTC
    self.num_classes_sub = num_classes_sub + 1  # Add the blank class

    # Setting for MTL
    self.main_loss_weight = main_loss_weight
    self.sub_loss_weight = sub_loss_weight

    with self.init_scope():
        # Override
        delattr(self, 'encoder')

        # Load the encoder
        if encoder_type in ['lstm', 'gru', 'rnn']:
            self.encoder = load(encoder_type=encoder_type)(
                input_size=input_size,  # 120 or 123
                rnn_type=encoder_type,
                bidirectional=encoder_bidirectional,
                num_units=encoder_num_units,
                num_proj=encoder_num_proj,
                num_layers=encoder_num_layers,
                num_layers_sub=encoder_num_layers_sub,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                subsample_list=subsample_list,
                subsample_type=subsample_type,
                use_cuda=self.use_cuda,
                merge_bidirectional=False,
                num_stack=num_stack,
                splice=splice,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                activation=activation,
                batch_norm=batch_norm,
                residual=encoder_residual,
                dense_residual=encoder_dense_residual)
        elif encoder_type == 'cnn':
            assert num_stack == 1 and splice == 1
            self.encoder = load(encoder_type='cnn')(
                input_size=input_size,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                use_cuda=self.use_cuda,
                activation=activation,
                batch_norm=batch_norm)
        else:
            raise NotImplementedError

        ##################################################
        # Fully-connected layers in the main task
        ##################################################
        # Override
        delattr(self, 'fc_out')

        if len(fc_list) > 0:
            for i in range(len(fc_list)):
                # Override
                delattr(self, 'fc_' + str(i))

                if i == 0:
                    if encoder_type == 'cnn':
                        bottle_input_size = self.encoder.output_size
                    else:
                        bottle_input_size = self.encoder_num_units
                    # TODO: add batch norm layers
                    setattr(self, 'fc_' + str(i),
                            LinearND(bottle_input_size, fc_list[i],
                                     dropout=dropout_encoder,
                                     use_cuda=self.use_cuda))
                else:
                    # TODO: add batch norm layers
                    setattr(self, 'fc_' + str(i),
                            LinearND(fc_list[i - 1], fc_list[i],
                                     dropout=dropout_encoder,
                                     use_cuda=self.use_cuda))
            # TODO: remove a bias term in the case of batch normalization
            self.fc_out = LinearND(fc_list[-1], self.num_classes,
                                   use_cuda=self.use_cuda)
        else:
            self.fc_out = LinearND(self.encoder_num_units, self.num_classes,
                                   use_cuda=self.use_cuda)

        ##################################################
        # Fully-connected layers in the sub task
        ##################################################
        if len(fc_list_sub) > 0:
            for i in range(len(fc_list_sub)):
                if i == 0:
                    if encoder_type == 'cnn':
                        bottle_input_size = self.encoder.output_size
                    else:
                        bottle_input_size = self.encoder_num_units
                    # TODO: add batch norm layers
                    setattr(self, 'fc_sub_' + str(i),
                            LinearND(bottle_input_size, fc_list_sub[i],
                                     dropout=dropout_encoder,
                                     use_cuda=self.use_cuda))
                else:
                    # TODO: add batch norm layers
                    setattr(self, 'fc_sub_' + str(i),
                            LinearND(fc_list_sub[i - 1], fc_list_sub[i],
                                     dropout=dropout_encoder,
                                     use_cuda=self.use_cuda))
            # TODO: remove a bias term in the case of batch normalization
            self.fc_out_sub = LinearND(fc_list_sub[-1], self.num_classes_sub,
                                       use_cuda=self.use_cuda)
        else:
            self.fc_out_sub = LinearND(self.encoder_num_units,
                                       self.num_classes_sub,
                                       use_cuda=self.use_cuda)

    ##################################################
    # Initialize parameters
    ##################################################
    self.init_weights(parameter_init,
                      distribution=parameter_init_distribution,
                      ignore_keys=['bias'])

    # Initialize all biases with 0
    self.init_weights(0, distribution='constant', keys=['bias'])

    # Recurrent weights are orthogonalized
    if recurrent_weight_orthogonal:
        self.init_weights(parameter_init,
                          distribution='orthogonal',
                          keys=['lstm', 'weight'],
                          ignore_keys=['bias'])

    # Initialize bias in forget gate with 1
    if init_forget_gate_bias_with_one:
        self.init_forget_gate_bias_with_one()
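# --- Usage sketch (illustrative only) ---
# A hypothetical hierarchical CTC setup: the main CTC head reads the top
# encoder layer, while the sub-task head (e.g. a finer label unit) branches
# off after encoder_num_layers_sub layers. Values are examples only.
#
#   model = HierarchicalCTC(input_size=120,
#                           encoder_type='lstm',
#                           encoder_bidirectional=True,
#                           encoder_num_units=320,
#                           encoder_num_proj=0,
#                           encoder_num_layers=5,
#                           encoder_num_layers_sub=3,
#                           fc_list=[],
#                           fc_list_sub=[],
#                           dropout_input=0.2,
#                           dropout_encoder=0.2,
#                           main_loss_weight=0.8,
#                           sub_loss_weight=0.2,
#                           num_classes=10000,
#                           num_classes_sub=28)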
def __init__(self,
             input_size,
             encoder_type,
             encoder_bidirectional,
             encoder_num_units,
             encoder_num_proj,
             encoder_num_layers,
             encoder_num_layers_sub,  # ***
             attention_type,
             attention_dim,
             decoder_type,
             decoder_num_units,
             decoder_num_layers,
             decoder_num_units_sub,  # ***
             decoder_num_layers_sub,  # ***
             embedding_dim,
             embedding_dim_sub,  # ***
             dropout_input,
             dropout_encoder,
             dropout_decoder,
             dropout_embedding,
             main_loss_weight,  # ***
             sub_loss_weight,  # ***
             num_classes,
             num_classes_sub,  # ***
             parameter_init_distribution='uniform',
             parameter_init=0.1,
             recurrent_weight_orthogonal=False,
             init_forget_gate_bias_with_one=True,
             subsample_list=[],
             subsample_type='drop',
             bridge_layer=False,
             init_dec_state='first',
             sharpening_factor=1,
             logits_temperature=1,
             sigmoid_smoothing=False,
             coverage_weight=0,
             ctc_loss_weight_sub=0,  # ***
             attention_conv_num_channels=10,
             attention_conv_width=201,
             num_stack=1,
             splice=1,
             input_channel=1,
             conv_channels=[],
             conv_kernel_sizes=[],
             conv_strides=[],
             poolings=[],
             activation='relu',
             batch_norm=False,
             scheduled_sampling_prob=0,
             scheduled_sampling_max_step=0,
             label_smoothing_prob=0,
             weight_noise_std=0,
             encoder_residual=False,
             encoder_dense_residual=False,
             decoder_residual=False,
             decoder_dense_residual=False,
             decoding_order='attend_generate_update',
             bottleneck_dim=256,
             bottleneck_dim_sub=256,  # ***
             backward_sub=False,  # ***
             num_heads=1,
             num_heads_sub=1):  # ***
    super(HierarchicalAttentionSeq2seq, self).__init__(
        input_size=input_size,
        encoder_type=encoder_type,
        encoder_bidirectional=encoder_bidirectional,
        encoder_num_units=encoder_num_units,
        encoder_num_proj=encoder_num_proj,
        encoder_num_layers=encoder_num_layers,
        attention_type=attention_type,
        attention_dim=attention_dim,
        decoder_type=decoder_type,
        decoder_num_units=decoder_num_units,
        decoder_num_layers=decoder_num_layers,
        embedding_dim=embedding_dim,
        dropout_input=dropout_input,
        dropout_encoder=dropout_encoder,
        dropout_decoder=dropout_decoder,
        dropout_embedding=dropout_embedding,
        num_classes=num_classes,
        parameter_init=parameter_init,
        subsample_list=subsample_list,
        subsample_type=subsample_type,
        bridge_layer=bridge_layer,
        init_dec_state=init_dec_state,
        sharpening_factor=sharpening_factor,
        logits_temperature=logits_temperature,
        sigmoid_smoothing=sigmoid_smoothing,
        coverage_weight=coverage_weight,
        ctc_loss_weight=0,
        attention_conv_num_channels=attention_conv_num_channels,
        attention_conv_width=attention_conv_width,
        num_stack=num_stack,
        splice=splice,
        input_channel=input_channel,
        conv_channels=conv_channels,
        conv_kernel_sizes=conv_kernel_sizes,
        conv_strides=conv_strides,
        poolings=poolings,
        scheduled_sampling_prob=scheduled_sampling_prob,
        scheduled_sampling_max_step=scheduled_sampling_max_step,
        label_smoothing_prob=label_smoothing_prob,
        weight_noise_std=weight_noise_std,
        encoder_residual=encoder_residual,
        encoder_dense_residual=encoder_dense_residual,
        decoder_residual=decoder_residual,
        decoder_dense_residual=decoder_dense_residual,
        decoding_order=decoding_order,
        bottleneck_dim=bottleneck_dim,
        backward_loss_weight=0,
        num_heads=num_heads)
    self.model_type = 'hierarchical_attention'

    # Setting for the encoder
    self.encoder_num_units_sub = encoder_num_units
    if encoder_bidirectional:
        self.encoder_num_units_sub *= 2

    # Setting for the decoder in the sub task
    self.decoder_num_units_1 = decoder_num_units_sub
    self.decoder_num_layers_1 = decoder_num_layers_sub
    self.num_classes_sub = num_classes_sub + 1  # Add <EOS> class
    self.sos_1 = num_classes_sub
    self.eos_1 = num_classes_sub
    # NOTE: <SOS> and <EOS> have the same index
    self.backward_1 = backward_sub

    # Setting for the decoder initialization in the sub task
    if backward_sub:
        if init_dec_state == 'first':
            self.init_dec_state_1_bwd = 'final'
        elif init_dec_state == 'final':
            self.init_dec_state_1_bwd = 'first'
        else:
            self.init_dec_state_1_bwd = init_dec_state
        if encoder_type != decoder_type:
            self.init_dec_state_1_bwd = 'zero'
    else:
        self.init_dec_state_1_fwd = init_dec_state
        if encoder_type != decoder_type:
            self.init_dec_state_1_fwd = 'zero'

    # Setting for the attention in the sub task
    self.num_heads_1 = num_heads_sub

    # Setting for MTL
    self.main_loss_weight = main_loss_weight
    self.sub_loss_weight = sub_loss_weight
    self.ctc_loss_weight_sub = ctc_loss_weight_sub
    if backward_sub:
        self.bwd_weight_1 = sub_loss_weight

    with self.init_scope():
        # Override encoder
        delattr(self, 'encoder')

        ##############################
        # Encoder
        ##############################
        if encoder_type in ['lstm', 'gru', 'rnn']:
            self.encoder = load(encoder_type=encoder_type)(
                input_size=input_size,
                rnn_type=encoder_type,
                bidirectional=encoder_bidirectional,
                num_units=encoder_num_units,
                num_proj=encoder_num_proj,
                num_layers=encoder_num_layers,
                num_layers_sub=encoder_num_layers_sub,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                subsample_list=subsample_list,
                subsample_type=subsample_type,
                use_cuda=self.use_cuda,
                merge_bidirectional=False,
                num_stack=num_stack,
                splice=splice,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                activation=activation,
                batch_norm=batch_norm,
                residual=encoder_residual,
                dense_residual=encoder_dense_residual)
        elif encoder_type == 'cnn':
            assert num_stack == 1 and splice == 1
            self.encoder = load(encoder_type='cnn')(
                input_size=input_size,
                input_channel=input_channel,
                conv_channels=conv_channels,
                conv_kernel_sizes=conv_kernel_sizes,
                conv_strides=conv_strides,
                poolings=poolings,
                dropout_input=dropout_input,
                dropout_hidden=dropout_encoder,
                use_cuda=self.use_cuda,
                activation=activation,
                batch_norm=batch_norm)
            self.init_dec_state_0 = 'zero'
            self.init_dec_state_1 = 'zero'
        else:
            raise NotImplementedError

        dir = 'bwd' if backward_sub else 'fwd'

        self.is_bridge_sub = False
        if self.sub_loss_weight > 0:
            ##################################################
            # Bridge layer between the encoder and decoder
            ##################################################
            if encoder_type == 'cnn':
                self.bridge_1 = LinearND(self.encoder.output_size,
                                         decoder_num_units_sub,
                                         dropout=dropout_encoder,
                                         use_cuda=self.use_cuda)
                self.encoder_num_units_sub = decoder_num_units_sub
                self.is_bridge_sub = True
            elif bridge_layer:
                self.bridge_1 = LinearND(self.encoder_num_units_sub,
                                         decoder_num_units_sub,
                                         dropout=dropout_encoder,
                                         use_cuda=self.use_cuda)
                self.encoder_num_units_sub = decoder_num_units_sub
                self.is_bridge_sub = True
            else:
                self.is_bridge_sub = False

            ##################################################
            # Initialization of the decoder
            ##################################################
            if getattr(self, 'init_dec_state_1_' + dir) != 'zero':
                setattr(self, 'W_dec_init_1_' + dir,
                        LinearND(self.encoder_num_units_sub,
                                 decoder_num_units_sub,
                                 use_cuda=self.use_cuda))

            ##############################
            # Decoder (sub)
            ##############################
            if decoding_order == 'conditional':
                setattr(self, 'decoder_first_1_' + dir,
                        RNNDecoder(input_size=embedding_dim_sub,
                                   rnn_type=decoder_type,
                                   num_units=decoder_num_units_sub,
                                   num_layers=1,
                                   dropout=dropout_decoder,
                                   use_cuda=self.use_cuda,
                                   residual=False,
                                   dense_residual=False))
                setattr(self, 'decoder_second_1_' + dir,
                        RNNDecoder(input_size=self.encoder_num_units_sub,
                                   rnn_type=decoder_type,
                                   num_units=decoder_num_units_sub,
                                   num_layers=1,
                                   dropout=dropout_decoder,
                                   use_cuda=self.use_cuda,
                                   residual=False,
                                   dense_residual=False))
                # NOTE: the conditional decoder only supports 1 layer
            else:
                setattr(self, 'decoder_1_' + dir,
                        RNNDecoder(input_size=self.encoder_num_units_sub + embedding_dim_sub,
                                   rnn_type=decoder_type,
                                   num_units=decoder_num_units_sub,
                                   num_layers=decoder_num_layers_sub,
                                   dropout=dropout_decoder,
                                   use_cuda=self.use_cuda,
                                   residual=decoder_residual,
                                   dense_residual=decoder_dense_residual))

            ###################################
            # Attention layer (sub)
            ###################################
            setattr(self, 'attend_1_' + dir,
                    AttentionMechanism(
                        encoder_num_units=self.encoder_num_units_sub,
                        decoder_num_units=decoder_num_units_sub,
                        attention_type=attention_type,
                        attention_dim=attention_dim,
                        use_cuda=self.use_cuda,
                        sharpening_factor=sharpening_factor,
                        sigmoid_smoothing=sigmoid_smoothing,
                        out_channels=attention_conv_num_channels,
                        kernel_size=attention_conv_width,
                        num_heads=num_heads_sub))

            ##############################
            # Output layer (sub)
            ##############################
            setattr(self, 'W_d_1_' + dir,
                    LinearND(decoder_num_units_sub, bottleneck_dim_sub,
                             dropout=dropout_decoder,
                             use_cuda=self.use_cuda))
            setattr(self, 'W_c_1_' + dir,
                    LinearND(self.encoder_num_units_sub, bottleneck_dim_sub,
                             dropout=dropout_decoder,
                             use_cuda=self.use_cuda))
            setattr(self, 'fc_1_' + dir,
                    LinearND(bottleneck_dim_sub, self.num_classes_sub,
                             use_cuda=self.use_cuda))

            ##############################
            # Embedding (sub)
            ##############################
            if label_smoothing_prob > 0:
                self.embed_1 = Embedding_LS(
                    num_classes=self.num_classes_sub,
                    embedding_dim=embedding_dim_sub,
                    dropout=dropout_embedding,
                    label_smoothing_prob=label_smoothing_prob,
                    use_cuda=self.use_cuda)
            else:
                self.embed_1 = Embedding(num_classes=self.num_classes_sub,
                                         embedding_dim=embedding_dim_sub,
                                         dropout=dropout_embedding,
                                         ignore_index=self.sos_1,
                                         use_cuda=self.use_cuda)

        ##############################
        # CTC (sub)
        ##############################
        if ctc_loss_weight_sub > 0:
            self.fc_ctc_1 = LinearND(self.encoder_num_units_sub,
                                     num_classes_sub + 1,
                                     use_cuda=self.use_cuda)

            # Set CTC decoders
            self._decode_ctc_greedy_np = GreedyDecoder(blank_index=0)
            self._decode_ctc_beam_np = BeamSearchDecoder(blank_index=0)

    ##################################################
    # Initialize parameters
    ##################################################
    self.init_weights(parameter_init,
                      distribution=parameter_init_distribution,
                      ignore_keys=['bias'])

    # Initialize all biases with 0
    self.init_weights(0, distribution='constant', keys=['bias'])

    # Recurrent weights are orthogonalized
    if recurrent_weight_orthogonal:
        self.init_weights(parameter_init,
                          distribution='orthogonal',
                          keys=[encoder_type, 'weight'],
                          ignore_keys=['bias'])
        self.init_weights(parameter_init,
                          distribution='orthogonal',
                          keys=[decoder_type, 'weight'],
                          ignore_keys=['bias'])

    # Initialize bias in forget gate with 1
    if init_forget_gate_bias_with_one:
        self.init_forget_gate_bias_with_one()
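# --- Usage sketch (illustrative only) ---
# Hypothetical construction of the hierarchical attention model: the sub-task
# decoder and attention (suffix *_1) read the encoder output taken at layer
# encoder_num_layers_sub, and an auxiliary CTC head is attached there when
# ctc_loss_weight_sub > 0. Values below are examples only.
#
#   model = HierarchicalAttentionSeq2seq(
#       input_size=120,
#       encoder_type='lstm', encoder_bidirectional=True,
#       encoder_num_units=320, encoder_num_proj=0,
#       encoder_num_layers=5, encoder_num_layers_sub=3,
#       attention_type='location', attention_dim=128,
#       decoder_type='lstm',
#       decoder_num_units=320, decoder_num_layers=1,
#       decoder_num_units_sub=320, decoder_num_layers_sub=1,
#       embedding_dim=64, embedding_dim_sub=32,
#       dropout_input=0.2, dropout_encoder=0.2,
#       dropout_decoder=0.2, dropout_embedding=0.2,
#       main_loss_weight=0.8, sub_loss_weight=0.2,
#       num_classes=10000, num_classes_sub=28,
#       ctc_loss_weight_sub=0)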