def __init__(self,
             input_size,
             encoder_type,
             encoder_bidirectional,
             encoder_num_units,
             encoder_num_proj,
             encoder_num_layers,
             encoder_num_layers_sub,  # ***
             fc_list,
             fc_list_sub,
             dropout_input,
             dropout_encoder,
             main_loss_weight,  # ***
             sub_loss_weight,  # ***
             num_classes,
             num_classes_sub,  # ***
             parameter_init_distribution='uniform',
             parameter_init=0.1,
             recurrent_weight_orthogonal=False,
             init_forget_gate_bias_with_one=True,
             subsample_list=[],
             subsample_type='drop',
             logits_temperature=1,
             num_stack=1,
             splice=1,
             input_channel=1,
             conv_channels=[],
             conv_kernel_sizes=[],
             conv_strides=[],
             poolings=[],
             activation='relu',
             batch_norm=False,
             label_smoothing_prob=0,
             weight_noise_std=0,
             encoder_residual=False,
             encoder_dense_residual=False):
    """Set up a CTC model with an additional (sub) CTC output.

    The parent constructor runs first. The encoder and the
    fully-connected layers are then rebuilt here so that the recurrent
    encoder also receives ``num_layers_sub`` and a second stack of layers
    (``fc_sub_*`` / ``fc_out_sub``) is registered for the sub task.
    Finally all parameters are (re-)initialized.

    Arguments tagged ``# ***`` in the signature are not forwarded to the
    parent constructor; they are consumed only by this method.

    Args:
        input_size: forwarded to the encoder as ``input_size``.
        encoder_type: ``'lstm'``, ``'gru'``, ``'rnn'`` or ``'cnn'``.
        encoder_bidirectional: forwarded to the recurrent encoder.
        encoder_num_units: forwarded to the recurrent encoder.
        encoder_num_proj: forwarded to the recurrent encoder.
        encoder_num_layers: forwarded to the recurrent encoder.
        encoder_num_layers_sub: forwarded to the recurrent encoder as
            ``num_layers_sub``.
        fc_list: output widths of the ``fc_<i>`` layers of the main task
            (may be empty).
        fc_list_sub: output widths of the ``fc_sub_<i>`` layers of the sub
            task (may be empty).
        dropout_input: forwarded to the encoder.
        dropout_encoder: dropout of the encoder and of every ``fc_*`` /
            ``fc_sub_*`` layer.
        main_loss_weight: stored as ``self.main_loss_weight``.
        sub_loss_weight: stored as ``self.sub_loss_weight``.
        num_classes: forwarded to the parent constructor.
        num_classes_sub: number of sub-task classes without the blank;
            ``self.num_classes_sub`` is this value plus one.
        parameter_init_distribution: ``distribution`` passed to
            ``init_weights`` for all non-bias parameters.
        parameter_init: first argument passed to ``init_weights``.
        recurrent_weight_orthogonal: when true (and the encoder is not a
            CNN) ``init_weights`` is called again with
            ``distribution='orthogonal'`` for the encoder weights.
        init_forget_gate_bias_with_one: when true,
            ``self.init_forget_gate_bias_with_one()`` is called last.
        subsample_list, subsample_type, logits_temperature, num_stack,
        splice, input_channel, conv_channels, conv_kernel_sizes,
        conv_strides, poolings, activation, batch_norm,
        label_smoothing_prob, weight_noise_std, encoder_residual,
        encoder_dense_residual: forwarded unchanged to the parent
            constructor and/or the encoder (see the calls below).

    Raises:
        NotImplementedError: if ``encoder_type`` is not one of the four
            supported values.
        AssertionError: if a CNN encoder is combined with
            ``num_stack != 1`` or ``splice != 1``.

    NOTE: the list defaults are shared between calls; this method only
    reads them and passes them on.
    """
    super(HierarchicalCTC, self).__init__(
        input_size=input_size,
        encoder_type=encoder_type,
        encoder_bidirectional=encoder_bidirectional,
        encoder_num_units=encoder_num_units,
        encoder_num_proj=encoder_num_proj,
        encoder_num_layers=encoder_num_layers,
        dropout_input=dropout_input,
        dropout_encoder=dropout_encoder,
        num_classes=num_classes,
        parameter_init=parameter_init,
        subsample_list=subsample_list,
        subsample_type=subsample_type,
        fc_list=fc_list,
        num_stack=num_stack,
        splice=splice,
        input_channel=input_channel,
        conv_channels=conv_channels,
        conv_kernel_sizes=conv_kernel_sizes,
        conv_strides=conv_strides,
        poolings=poolings,
        logits_temperature=logits_temperature,
        batch_norm=batch_norm,
        label_smoothing_prob=label_smoothing_prob,
        weight_noise_std=weight_noise_std)
    self.model_type = 'hierarchical_ctc'

    # Setting for the encoder
    self.encoder_num_layers_sub = encoder_num_layers_sub
    self.fc_list_sub = fc_list_sub

    # Setting for CTC
    self.num_classes_sub = num_classes_sub + 1  # Add the blank class

    # Setting for MTL
    self.main_loss_weight = main_loss_weight
    self.sub_loss_weight = sub_loss_weight

    # Load the encoder
    # NOTE: this replaces the encoder assigned by the parent constructor
    if encoder_type in ['lstm', 'gru', 'rnn']:
        self.encoder = load(encoder_type=encoder_type)(
            input_size=input_size,
            rnn_type=encoder_type,
            bidirectional=encoder_bidirectional,
            num_units=encoder_num_units,
            num_proj=encoder_num_proj,
            num_layers=encoder_num_layers,
            num_layers_sub=encoder_num_layers_sub,
            dropout_input=dropout_input,
            dropout_hidden=dropout_encoder,
            subsample_list=subsample_list,
            subsample_type=subsample_type,
            batch_first=True,
            merge_bidirectional=False,
            pack_sequence=True,
            num_stack=num_stack,
            splice=splice,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            activation=activation,
            batch_norm=batch_norm,
            residual=encoder_residual,
            dense_residual=encoder_dense_residual)
    elif encoder_type == 'cnn':
        assert num_stack == 1 and splice == 1
        self.encoder = load(encoder_type='cnn')(
            input_size=input_size,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            dropout_input=dropout_input,
            dropout_hidden=dropout_encoder,
            activation=activation,
            batch_norm=batch_norm)
    else:
        raise NotImplementedError

    # Width of the features entering the first layer after the encoder.
    # It is resolved once so that the output layers are sized correctly
    # for a CNN encoder even when no intermediate layers are requested.
    if encoder_type == 'cnn':
        encoder_output_size = self.encoder.output_size
    else:
        # self.encoder_num_units is not assigned in this method; it is
        # expected to have been set by the parent constructor.
        encoder_output_size = self.encoder_num_units

    def stack_fc_layers(name_prefix, layer_sizes):
        """Register ``<name_prefix><i>`` layers; return the last width."""
        in_size = encoder_output_size
        for i, out_size in enumerate(layer_sizes):
            # TODO: add batch norm layers
            setattr(self, name_prefix + str(i),
                    LinearND(in_size, out_size, dropout=dropout_encoder))
            in_size = out_size
        return in_size

    ##################################################
    # Fully-connected layers in the main task
    ##################################################
    # TODO: remove a bias term in the case of batch normalization
    self.fc_out = LinearND(stack_fc_layers('fc_', fc_list),
                           self.num_classes)

    ##################################################
    # Fully-connected layers in the sub task
    ##################################################
    # TODO: remove a bias term in the case of batch normalization
    self.fc_out_sub = LinearND(stack_fc_layers('fc_sub_', fc_list_sub),
                               self.num_classes_sub)

    ##################################################
    # Initialize parameters
    ##################################################
    self.init_weights(parameter_init,
                      distribution=parameter_init_distribution,
                      ignore_keys=['bias'])

    # Initialize all biases with 0
    self.init_weights(0, distribution='constant', keys=['bias'])

    # Recurrent weights are orthogonalized. The key follows the encoder
    # type (instead of a fixed 'lstm') and CNN encoders are skipped, as in
    # the plain CTC constructor.
    if recurrent_weight_orthogonal and encoder_type != 'cnn':
        self.init_weights(parameter_init,
                          distribution='orthogonal',
                          keys=[encoder_type, 'weight'],
                          ignore_keys=['bias'])

    # Initialize bias in forget gate with 1
    if init_forget_gate_bias_with_one:
        self.init_forget_gate_bias_with_one()
def check(self, encoder_type, bidirectional=False, batch_first=True,
          subsample_type='concat', conv=False, merge_bidirectional=False,
          projection=False, residual=False, dense_residual=False):
    """Run one hierarchical encoder configuration and check output sizes.

    Prints the configuration, builds a 6-layer recurrent encoder with a
    sub output taken at ``num_layers_sub=4``, feeds it one generated batch
    and asserts the sizes of the main and sub outputs.

    Raises:
        NotImplementedError: if ``encoder_type`` is not ``'lstm'``,
            ``'gru'`` or ``'rnn'``.
    """
    separator = '=================================================='
    report = [
        (' encoder_type: %s', encoder_type),
        (' bidirectional: %s', str(bidirectional)),
        (' batch_first: %s', str(batch_first)),
        (' subsample_type: %s', subsample_type),
        (' conv: %s', str(conv)),
        (' merge_bidirectional: %s', str(merge_bidirectional)),
        (' projection: %s', str(projection)),
        (' residual: %s', str(residual)),
        (' dense_residual: %s', str(dense_residual)),
    ]
    print(separator)
    for template, value in report:
        print(template % value)
    print(separator)

    # Convolutional front-end: a VGG-like two-layer setup, or none at all
    if conv:
        conv_channels = [64, 64]
        conv_kernel_sizes = [[3, 3], [3, 3]]
        conv_strides = [[1, 1], [1, 1]]
        poolings = [[2, 2], [2, 2]]
    else:
        conv_channels, conv_kernel_sizes = [], []
        conv_strides, poolings = [], []

    # Load batch data
    batch_size = 4
    splice = 1
    num_stack = 1
    xs, _, x_lens, _ = generate_data(batch_size=batch_size,
                                     num_stack=num_stack,
                                     splice=splice)

    # Wrap by Tensor
    xs = torch.from_numpy(xs)
    x_lens = torch.from_numpy(x_lens)

    # Load the encoder class first, then reject unsupported types
    encoder_cls = load(encoder_type=encoder_type)
    if encoder_type not in ['lstm', 'gru', 'rnn']:
        raise NotImplementedError

    num_proj = 256 if projection else 0
    encoder = encoder_cls(
        input_size=xs.size(-1) // splice // num_stack,
        rnn_type=encoder_type,
        bidirectional=bidirectional,
        num_units=256,
        num_proj=num_proj,
        num_layers=6,
        num_layers_sub=4,
        dropout_input=0.2,
        dropout_hidden=0.2,
        subsample_list=[False, True, True, False, False, False],
        subsample_type=subsample_type,
        batch_first=batch_first,
        merge_bidirectional=merge_bidirectional,
        splice=splice,
        num_stack=num_stack,
        conv_channels=conv_channels,
        conv_kernel_sizes=conv_kernel_sizes,
        conv_strides=conv_strides,
        poolings=poolings,
        batch_norm=True,
        residual=residual,
        dense_residual=dense_residual)

    # Expected number of frames: the time axis is halved once for every
    # subsampling layer up to the point where each output is taken.
    max_time = xs.size(1)
    if conv:
        max_time = encoder.conv.get_conv_out_size(max_time, 1)
    factor_sub = 2 ** sum(encoder.subsample_list[:encoder.num_layers_sub])
    factor_main = 2 ** sum(encoder.subsample_list)
    max_time_sub = max_time // factor_sub
    max_time //= factor_main
    # NOTE(review): both values are already floor-divided, so for integer
    # lengths the rounding below does not change them - confirm that
    # flooring is what the 'drop' subsampling is expected to produce.
    if subsample_type == 'drop':
        max_time_sub = math.ceil(max_time_sub)
        max_time = math.ceil(max_time)
    elif subsample_type == 'concat':
        max_time_sub = int(max_time_sub)
        max_time = int(max_time)

    outputs, _, outputs_sub, _, perm_indices = encoder(xs, x_lens)

    print('----- outputs -----')
    print(xs.size())
    print(outputs_sub.size())
    print(outputs.size())

    # Unmerged bidirectional outputs are twice as wide
    if bidirectional and not merge_bidirectional:
        num_directions = 2
    else:
        num_directions = 1

    if batch_first:
        expected_sub = (batch_size, max_time_sub,
                        encoder.num_units * num_directions)
        expected_main = (batch_size, max_time,
                         encoder.num_units * num_directions)
    else:
        expected_sub = (max_time_sub, batch_size,
                        encoder.num_units * num_directions)
        expected_main = (max_time, batch_size,
                         encoder.num_units * num_directions)
    self.assertEqual(expected_sub, outputs_sub.size())
    self.assertEqual(expected_main, outputs.size())
def __init__(self,
             input_size,
             encoder_type,
             encoder_bidirectional,
             encoder_num_units,
             encoder_num_proj,
             encoder_num_layers,
             fc_list,
             dropout_input,
             dropout_encoder,
             num_classes,
             parameter_init_distribution='uniform',
             parameter_init=0.1,
             recurrent_weight_orthogonal=False,
             init_forget_gate_bias_with_one=True,
             subsample_list=[],
             subsample_type='drop',
             logits_temperature=1,
             num_stack=1,
             splice=1,
             input_channel=1,
             conv_channels=[],
             conv_kernel_sizes=[],
             conv_strides=[],
             poolings=[],
             activation='relu',
             batch_norm=False,
             label_smoothing_prob=0,
             weight_noise_std=0,
             encoder_residual=False,
             encoder_dense_residual=False):
    """Build the encoder, the fully-connected layers and the CTC decoders.

    Args:
        input_size: stored as ``self.input_size`` and forwarded to the
            encoder.
        encoder_type: ``'lstm'``, ``'gru'``, ``'rnn'`` or ``'cnn'``.
        encoder_bidirectional: forwarded to the recurrent encoder; when
            true ``self.encoder_num_units`` is doubled.
        encoder_num_units: forwarded to the recurrent encoder.
        encoder_num_proj: forwarded to the recurrent encoder.
        encoder_num_layers: forwarded to the recurrent encoder.
        fc_list: output widths of the ``fc_<i>`` layers placed between the
            encoder and ``fc_out`` (may be empty).
        dropout_input: forwarded to the encoder.
        dropout_encoder: dropout of the encoder and of every ``fc_<i>``
            layer.
        num_classes: number of classes without the blank;
            ``self.num_classes`` is this value plus one.
        parameter_init_distribution: ``distribution`` passed to
            ``init_weights`` for all non-bias parameters.
        parameter_init: first argument passed to ``init_weights``.
        recurrent_weight_orthogonal: when true and the encoder is not a
            CNN, ``init_weights`` is called again with
            ``distribution='orthogonal'`` for the encoder weights.
        init_forget_gate_bias_with_one: when true,
            ``self.init_forget_gate_bias_with_one()`` is called.
        subsample_list: stored as ``self.subsample_list`` and forwarded to
            the recurrent encoder.
        logits_temperature: stored as ``self.logits_temperature``.
        label_smoothing_prob: stored as ``self.ls_prob``.
        weight_noise_std: stored as a float in ``self.weight_noise_std``.
        subsample_type, num_stack, splice, input_channel, conv_channels,
        conv_kernel_sizes, conv_strides, poolings, activation, batch_norm,
        encoder_residual, encoder_dense_residual: forwarded to the encoder
            (see the calls below).

    Raises:
        NotImplementedError: if ``encoder_type`` is not one of the four
            supported values.
        AssertionError: if a CNN encoder is combined with
            ``num_stack != 1`` or ``splice != 1``.

    NOTE: the list defaults are shared between calls and
    ``subsample_list`` is stored without copying; this method itself only
    reads them.
    """
    # NOTE(review): the first argument is ModelBase rather than this
    # class, so ModelBase.__init__ itself is skipped and the next
    # __init__ in the MRO runs instead - confirm this is intended.
    super(ModelBase, self).__init__()
    self.model_type = 'ctc'

    # Setting for the encoder
    self.input_size = input_size
    self.num_stack = num_stack
    self.encoder_type = encoder_type
    self.encoder_num_units = encoder_num_units
    if encoder_bidirectional:
        self.encoder_num_units *= 2
    self.fc_list = fc_list
    self.subsample_list = subsample_list

    # Setting for CTC
    self.num_classes = num_classes + 1  # Add the blank class
    self.logits_temperature = logits_temperature

    # Setting for regularization
    self.weight_noise_injection = False
    self.weight_noise_std = float(weight_noise_std)
    self.ls_prob = label_smoothing_prob

    # Call the encoder function
    if encoder_type in ['lstm', 'gru', 'rnn']:
        self.encoder = load(encoder_type=encoder_type)(
            input_size=input_size,
            rnn_type=encoder_type,
            bidirectional=encoder_bidirectional,
            num_units=encoder_num_units,
            num_proj=encoder_num_proj,
            num_layers=encoder_num_layers,
            dropout_input=dropout_input,
            dropout_hidden=dropout_encoder,
            subsample_list=subsample_list,
            subsample_type=subsample_type,
            batch_first=True,
            merge_bidirectional=False,
            pack_sequence=True,
            num_stack=num_stack,
            splice=splice,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            activation=activation,
            batch_norm=batch_norm,
            residual=encoder_residual,
            dense_residual=encoder_dense_residual,
            nin=0)
    elif encoder_type == 'cnn':
        assert num_stack == 1 and splice == 1
        self.encoder = load(encoder_type='cnn')(
            input_size=input_size,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            dropout_input=dropout_input,
            dropout_hidden=dropout_encoder,
            activation=activation,
            batch_norm=batch_norm)
    else:
        raise NotImplementedError

    ##################################################
    # Fully-connected layers
    ##################################################
    # Width of the features entering the first layer after the encoder.
    # It is resolved once so that fc_out is sized correctly for a CNN
    # encoder even when fc_list is empty.
    if encoder_type == 'cnn':
        layer_input_size = self.encoder.output_size
    else:
        layer_input_size = self.encoder_num_units

    for i, layer_output_size in enumerate(fc_list):
        # TODO: add batch norm layers
        setattr(self, 'fc_' + str(i),
                LinearND(layer_input_size, layer_output_size,
                         dropout=dropout_encoder))
        layer_input_size = layer_output_size

    # TODO: remove a bias term in the case of batch normalization
    self.fc_out = LinearND(layer_input_size, self.num_classes)

    ##################################################
    # Initialize parameters
    ##################################################
    self.init_weights(parameter_init,
                      distribution=parameter_init_distribution,
                      ignore_keys=['bias'])

    # Initialize all biases with 0
    self.init_weights(0, distribution='constant', keys=['bias'])

    # Recurrent weights are orthogonalized
    if recurrent_weight_orthogonal and encoder_type != 'cnn':
        self.init_weights(parameter_init,
                          distribution='orthogonal',
                          keys=[encoder_type, 'weight'],
                          ignore_keys=['bias'])

    # Initialize bias in forget gate with 1
    if init_forget_gate_bias_with_one:
        self.init_forget_gate_bias_with_one()

    # Set CTC decoders
    self._decode_greedy_np = GreedyDecoder(blank_index=0)
    self._decode_beam_np = BeamSearchDecoder(blank_index=0)
def __init__(self,
             input_size,
             encoder_type,
             encoder_bidirectional,
             encoder_num_units,
             encoder_num_proj,
             encoder_num_layers,
             encoder_num_layers_sub,  # ***
             attention_type,
             attention_dim,
             decoder_type,
             decoder_num_units,
             decoder_num_units_sub,  # ***
             decoder_num_layers,
             decoder_num_layers_sub,  # ***
             embedding_dim,
             embedding_dim_sub,  # ***
             dropout_input,
             dropout_encoder,
             dropout_decoder,
             dropout_embedding,
             main_loss_weight,  # ***
             sub_loss_weight,  # ***
             num_classes,
             num_classes_sub,  # ***
             parameter_init_distribution='uniform',
             parameter_init=0.1,
             recurrent_weight_orthogonal=False,
             init_forget_gate_bias_with_one=True,
             subsample_list=[],
             subsample_type='drop',
             bridge_layer=False,
             init_dec_state='first',
             sharpening_factor=1,
             logits_temperature=1,
             sigmoid_smoothing=False,
             coverage_weight=0,
             ctc_loss_weight_sub=0,  # ***
             attention_conv_num_channels=10,
             attention_conv_width=201,
             num_stack=1,
             splice=1,
             input_channel=1,
             conv_channels=[],
             conv_kernel_sizes=[],
             conv_strides=[],
             poolings=[],
             activation='relu',
             batch_norm=False,
             scheduled_sampling_prob=0,
             scheduled_sampling_max_step=0,
             label_smoothing_prob=0,
             weight_noise_std=0,
             encoder_residual=False,
             encoder_dense_residual=False,
             decoder_residual=False,
             decoder_dense_residual=False,
             decoding_order='attend_generate_update',
             bottleneck_dim=256,
             bottleneck_dim_sub=256,  # ***
             backward_sub=False,  # ***
             num_heads=1,
             num_heads_sub=1):  # ***
    """Set up an attention model with an additional sub-task decoder.

    The parent constructor runs first (with ``ctc_loss_weight=0`` and
    ``backward_loss_weight=0``). This method then replaces the encoder
    with one that also receives ``num_layers_sub`` and registers the
    sub-task modules: bridge, decoder(s), attention, output layers,
    embedding and, optionally, a CTC output layer. Finally all parameters
    are (re-)initialized.

    Sub-task modules whose names end in ``_fwd`` / ``_bwd`` are registered
    for one direction only, selected by ``backward_sub``.

    Arguments tagged ``# ***`` in the signature are not forwarded to the
    parent constructor; they are consumed only by this method.

    Args:
        encoder_type: ``'lstm'``, ``'gru'``, ``'rnn'`` or ``'cnn'``.
        encoder_num_layers_sub: forwarded to the recurrent encoder as
            ``num_layers_sub``.
        decoder_type: ``rnn_type`` of the sub-task decoder(s); when it
            differs from ``encoder_type`` the sub-task decoder
            initialization is forced to ``'zero'``.
        decoder_num_units_sub: number of units of the sub-task decoder.
        decoder_num_layers_sub: number of layers of the sub-task decoder
            (the ``'conditional'`` decoding order always uses two
            one-layer decoders).
        embedding_dim_sub: embedding size of the sub task.
        main_loss_weight: stored as ``self.main_loss_weight``.
        sub_loss_weight: stored as ``self.sub_loss_weight``; the sub-task
            attention decoder is only built when it is positive.
        num_classes_sub: number of sub-task classes without ``<EOS>``;
            ``self.num_classes_sub`` is this value plus one and
            ``<SOS>`` / ``<EOS>`` both use index ``num_classes_sub``.
        init_dec_state: decoder initialization mode; for a backward sub
            task ``'first'`` and ``'final'`` are swapped.
        bridge_layer: when true, a ``bridge_1`` layer maps the encoder
            outputs to ``decoder_num_units_sub`` (always done for a CNN
            encoder).
        ctc_loss_weight_sub: stored as ``self.ctc_loss_weight_sub``; when
            positive a ``fc_ctc_1`` layer and the CTC decoders are
            created.
        bottleneck_dim_sub: width of the ``W_d_1_*`` / ``W_c_1_*`` output
            projections.
        backward_sub: selects the ``bwd`` instead of the ``fwd`` sub-task
            modules.
        num_heads_sub: ``num_heads`` of the sub-task attention.
        recurrent_weight_orthogonal: when true, ``init_weights`` is called
            with ``distribution='orthogonal'`` for the decoder weights and,
            unless the encoder is a CNN, for the encoder weights.
        init_forget_gate_bias_with_one: when true,
            ``self.init_forget_gate_bias_with_one()`` is called last.

        All remaining arguments are forwarded unchanged to the parent
        constructor, the encoder or the sub-task modules (see the calls
        below).

    Raises:
        NotImplementedError: if ``encoder_type`` is not one of the four
            supported values.
        AssertionError: if a CNN encoder is combined with
            ``num_stack != 1`` or ``splice != 1``.
    """
    super(HierarchicalAttentionSeq2seq, self).__init__(
        input_size=input_size,
        encoder_type=encoder_type,
        encoder_bidirectional=encoder_bidirectional,
        encoder_num_units=encoder_num_units,
        encoder_num_proj=encoder_num_proj,
        encoder_num_layers=encoder_num_layers,
        attention_type=attention_type,
        attention_dim=attention_dim,
        decoder_type=decoder_type,
        decoder_num_units=decoder_num_units,
        decoder_num_layers=decoder_num_layers,
        embedding_dim=embedding_dim,
        dropout_input=dropout_input,
        dropout_encoder=dropout_encoder,
        dropout_decoder=dropout_decoder,
        dropout_embedding=dropout_embedding,
        num_classes=num_classes,
        parameter_init=parameter_init,
        subsample_list=subsample_list,
        subsample_type=subsample_type,
        bridge_layer=bridge_layer,
        init_dec_state=init_dec_state,
        sharpening_factor=sharpening_factor,
        logits_temperature=logits_temperature,
        sigmoid_smoothing=sigmoid_smoothing,
        coverage_weight=coverage_weight,
        ctc_loss_weight=0,
        attention_conv_num_channels=attention_conv_num_channels,
        attention_conv_width=attention_conv_width,
        num_stack=num_stack,
        splice=splice,
        input_channel=input_channel,
        conv_channels=conv_channels,
        conv_kernel_sizes=conv_kernel_sizes,
        conv_strides=conv_strides,
        poolings=poolings,
        scheduled_sampling_prob=scheduled_sampling_prob,
        scheduled_sampling_max_step=scheduled_sampling_max_step,
        label_smoothing_prob=label_smoothing_prob,
        weight_noise_std=weight_noise_std,
        encoder_residual=encoder_residual,
        encoder_dense_residual=encoder_dense_residual,
        decoder_residual=decoder_residual,
        decoder_dense_residual=decoder_dense_residual,
        decoding_order=decoding_order,
        bottleneck_dim=bottleneck_dim,
        backward_loss_weight=0,
        num_heads=num_heads)
    self.model_type = 'hierarchical_attention'

    # Setting for the encoder
    self.encoder_num_units_sub = encoder_num_units
    if encoder_bidirectional:
        self.encoder_num_units_sub *= 2

    # Setting for the decoder in the sub task
    self.decoder_num_units_1 = decoder_num_units_sub
    self.decoder_num_layers_1 = decoder_num_layers_sub
    self.num_classes_sub = num_classes_sub + 1  # Add <EOS> class
    self.sos_1 = num_classes_sub
    self.eos_1 = num_classes_sub
    # NOTE: <SOS> and <EOS> have the same index
    self.backward_1 = backward_sub

    # Setting for the decoder initialization in the sub task
    if backward_sub:
        # A backward decoder starts from the opposite end of the encoder
        if init_dec_state == 'first':
            self.init_dec_state_1_bwd = 'final'
        elif init_dec_state == 'final':
            self.init_dec_state_1_bwd = 'first'
        else:
            self.init_dec_state_1_bwd = init_dec_state
        if encoder_type != decoder_type:
            self.init_dec_state_1_bwd = 'zero'
    else:
        self.init_dec_state_1_fwd = init_dec_state
        if encoder_type != decoder_type:
            self.init_dec_state_1_fwd = 'zero'

    # Setting for the attention in the sub task
    self.num_heads_1 = num_heads_sub

    # Setting for MTL
    self.main_loss_weight = main_loss_weight
    self.sub_loss_weight = sub_loss_weight
    self.ctc_loss_weight_sub = ctc_loss_weight_sub
    if backward_sub:
        self.bwd_weight_1 = sub_loss_weight

    ##############################
    # Encoder
    # NOTE: this replaces the encoder assigned by the parent constructor
    ##############################
    if encoder_type in ['lstm', 'gru', 'rnn']:
        self.encoder = load(encoder_type=encoder_type)(
            input_size=input_size,
            rnn_type=encoder_type,
            bidirectional=encoder_bidirectional,
            num_units=encoder_num_units,
            num_proj=encoder_num_proj,
            num_layers=encoder_num_layers,
            num_layers_sub=encoder_num_layers_sub,
            dropout_input=dropout_input,
            dropout_hidden=dropout_encoder,
            subsample_list=subsample_list,
            subsample_type=subsample_type,
            batch_first=True,
            merge_bidirectional=False,
            pack_sequence=True,
            num_stack=num_stack,
            splice=splice,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            activation=activation,
            batch_norm=batch_norm,
            residual=encoder_residual,
            dense_residual=encoder_dense_residual)
    elif encoder_type == 'cnn':
        assert num_stack == 1 and splice == 1
        self.encoder = load(encoder_type='cnn')(
            input_size=input_size,
            input_channel=input_channel,
            conv_channels=conv_channels,
            conv_kernel_sizes=conv_kernel_sizes,
            conv_strides=conv_strides,
            poolings=poolings,
            dropout_input=dropout_input,
            dropout_hidden=dropout_encoder,
            activation=activation,
            batch_norm=batch_norm)
        # NOTE(review): these two attributes are not read again in this
        # method (the check below reads init_dec_state_1_fwd/_bwd) -
        # verify which names the rest of the class expects.
        self.init_dec_state_0 = 'zero'
        self.init_dec_state_1 = 'zero'
    else:
        raise NotImplementedError

    # Suffix of the sub-task module names ('direction' rather than 'dir',
    # which would shadow the builtin)
    direction = 'bwd' if backward_sub else 'fwd'

    self.is_bridge_sub = False
    if self.sub_loss_weight > 0:
        ##################################################
        # Bridge layer between the encoder and decoder
        ##################################################
        if encoder_type == 'cnn':
            self.bridge_1 = LinearND(self.encoder.output_size,
                                     decoder_num_units_sub,
                                     dropout=dropout_encoder)
            self.encoder_num_units_sub = decoder_num_units_sub
            self.is_bridge_sub = True
        elif bridge_layer:
            self.bridge_1 = LinearND(self.encoder_num_units_sub,
                                     decoder_num_units_sub,
                                     dropout=dropout_encoder)
            self.encoder_num_units_sub = decoder_num_units_sub
            self.is_bridge_sub = True

        ##################################################
        # Initialization of the decoder
        ##################################################
        if getattr(self, 'init_dec_state_1_' + direction) != 'zero':
            setattr(self, 'W_dec_init_1_' + direction,
                    LinearND(self.encoder_num_units_sub,
                             decoder_num_units_sub))

        ##############################
        # Decoder (sub)
        ##############################
        if decoding_order == 'conditional':
            setattr(self, 'decoder_first_1_' + direction,
                    RNNDecoder(input_size=embedding_dim_sub,
                               rnn_type=decoder_type,
                               num_units=decoder_num_units_sub,
                               num_layers=1,
                               dropout=dropout_decoder,
                               residual=False,
                               dense_residual=False))
            setattr(self, 'decoder_second_1_' + direction,
                    RNNDecoder(input_size=self.encoder_num_units_sub,
                               rnn_type=decoder_type,
                               num_units=decoder_num_units_sub,
                               num_layers=1,
                               dropout=dropout_decoder,
                               residual=False,
                               dense_residual=False))
            # NOTE: the conditional decoder only supports the 1 layer
        else:
            setattr(self, 'decoder_1_' + direction,
                    RNNDecoder(
                        input_size=(self.encoder_num_units_sub +
                                    embedding_dim_sub),
                        rnn_type=decoder_type,
                        num_units=decoder_num_units_sub,
                        num_layers=decoder_num_layers_sub,
                        dropout=dropout_decoder,
                        residual=decoder_residual,
                        dense_residual=decoder_dense_residual))

        ###################################
        # Attention layer (sub)
        ###################################
        setattr(self, 'attend_1_' + direction,
                AttentionMechanism(
                    encoder_num_units=self.encoder_num_units_sub,
                    decoder_num_units=decoder_num_units_sub,
                    attention_type=attention_type,
                    attention_dim=attention_dim,
                    sharpening_factor=sharpening_factor,
                    sigmoid_smoothing=sigmoid_smoothing,
                    out_channels=attention_conv_num_channels,
                    kernel_size=attention_conv_width,
                    num_heads=num_heads_sub))

        ##############################
        # Output layer (sub)
        ##############################
        setattr(self, 'W_d_1_' + direction,
                LinearND(decoder_num_units_sub, bottleneck_dim_sub,
                         dropout=dropout_decoder))
        setattr(self, 'W_c_1_' + direction,
                LinearND(self.encoder_num_units_sub, bottleneck_dim_sub,
                         dropout=dropout_decoder))
        setattr(self, 'fc_1_' + direction,
                LinearND(bottleneck_dim_sub, self.num_classes_sub))

        ##############################
        # Embedding (sub)
        ##############################
        if label_smoothing_prob > 0:
            self.embed_1 = Embedding_LS(
                num_classes=self.num_classes_sub,
                embedding_dim=embedding_dim_sub,
                dropout=dropout_embedding,
                label_smoothing_prob=label_smoothing_prob)
        else:
            self.embed_1 = Embedding(
                num_classes=self.num_classes_sub,
                embedding_dim=embedding_dim_sub,
                dropout=dropout_embedding,
                ignore_index=-1)

    ##############################
    # CTC (sub)
    ##############################
    if ctc_loss_weight_sub > 0:
        self.fc_ctc_1 = LinearND(self.encoder_num_units_sub,
                                 num_classes_sub + 1)

        # Set CTC decoders
        self._decode_ctc_greedy_np = GreedyDecoder(blank_index=0)
        self._decode_ctc_beam_np = BeamSearchDecoder(blank_index=0)
        # NOTE: index 0 is reserved for the blank class

    ##################################################
    # Initialize parameters
    ##################################################
    self.init_weights(parameter_init,
                      distribution=parameter_init_distribution,
                      ignore_keys=['bias'])

    # Initialize all biases with 0
    self.init_weights(0, distribution='constant', keys=['bias'])

    # Recurrent weights are orthogonalized. A CNN encoder has no recurrent
    # weights, so it is skipped, as in the plain CTC constructor.
    if recurrent_weight_orthogonal:
        if encoder_type != 'cnn':
            self.init_weights(parameter_init,
                              distribution='orthogonal',
                              keys=[encoder_type, 'weight'],
                              ignore_keys=['bias'])
        self.init_weights(parameter_init,
                          distribution='orthogonal',
                          keys=[decoder_type, 'weight'],
                          ignore_keys=['bias'])

    # Initialize bias in forget gate with 1
    if init_forget_gate_bias_with_one:
        self.init_forget_gate_bias_with_one()