def compile(self, feature_shape=(None, 200), label_max_string_length=64,
            ms_output_size=1438, h_dim=128):
    """Build the training and inference models and hand them to ``self.built``.

    The acoustic network is an initial ``wave_conv1d_block`` followed by
    fifteen ``wave_residual_block`` calls (the rate schedule 1, 2, 4, 8, 16
    repeated three times).  The second output of every residual block is
    summed with ``Add`` and the sum is passed through two further
    ``wave_conv1d_block`` calls, the last one using a softmax activation.

    Two models share that graph:

    * the training model takes the features plus labels and both length
      inputs, and outputs the result of ``CTC_Batch_Cost``;
    * the base model maps the features directly to the softmax output.

    Args:
        feature_shape: ``shape`` given to the ``mfcc_input`` input layer.
        label_max_string_length: length of the ``label_inputs`` input.
        ms_output_size: ``filters`` of the final softmax block.
        h_dim: ``filters`` used by the first block and every residual block.
    """
    features = Input(shape=feature_shape, name="mfcc_input")
    hidden = self.wave_conv1d_block(features, filters=h_dim)

    # Three passes over the same rate list, flattened into one sequence.
    rate_schedule = [1, 2, 4, 8, 16] * 3
    skip_sum = None
    for rate in rate_schedule:
        hidden, skip_out = self.wave_residual_block(
            hidden, kernal_size=7, rate=rate, filters=h_dim)
        # The first skip output seeds the running sum; later ones are added.
        skip_sum = skip_out if skip_sum is None else Add()([skip_sum, skip_out])

    logit = self.wave_conv1d_block(skip_sum, filters=64)
    y_pred = self.wave_conv1d_block(
        logit, filters=ms_output_size, bias=True, activation="softmax")

    # Extra inputs that are only consumed by the CTC cost layer.
    labels = Input(name='label_inputs',
                   shape=[label_max_string_length],
                   dtype='float32')
    audio_length = Input(name='audio_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')

    ctc_cost = CTC_Batch_Cost()([labels, y_pred, audio_length, label_length])

    train_model = Model([features, labels, audio_length, label_length],
                        [ctc_cost])
    # The model output already is the cost, so the loss just returns it.
    train_model.compile(
        optimizer="adam",
        loss={"ctc": lambda y_true, y_pred: y_pred},
    )

    base_model = Model(features, y_pred)
    self.built(train_model, base_model)
def compile(self, feature_shape=(1024, 200), label_max_string_length=32,
            ms_output_size=1423):
    """Build the training and inference models and hand them to ``self.built``.

    The acoustic network stacks three pooled ``cnn1d_cell`` calls (32, 64
    and 64 filters), then adds the result of a ``conv1d_layers`` +
    un-pooled ``cnn1d_cell`` branch back onto that stack's output.  The sum
    goes through dropout, a 256-unit ReLU ``Dense``, dropout again, and a
    ``Dense(ms_output_size)`` followed by a softmax activation.

    Two models share that graph:

    * the training model takes the audio features plus labels and both
      length inputs, and outputs the result of ``CTC_Batch_Cost``;
    * the base model maps the audio features directly to the softmax output.

    Args:
        feature_shape: ``shape`` given to the ``audio_input`` input layer.
        label_max_string_length: length of the ``label_inputs`` input.
        ms_output_size: number of units in the final ``Dense`` layer.
    """
    audio_features = Input(name="audio_input", shape=feature_shape)

    # Pooled convolutional trunk.
    trunk = audio_features
    for n_filters in (32, 64, 64):
        trunk = self.cnn1d_cell(n_filters, trunk, pool=True)

    # Residual branch that is added back onto the trunk output.
    branch = self.conv1d_layers(trunk, 64, 8)
    branch = self.cnn1d_cell(64, branch, pool=False)
    merged = Add()([trunk, branch])

    # Two Dense layers, each preceded by dropout.
    hidden = Dropout(0.2)(merged)
    hidden = Dense(256, activation="relu",
                   kernel_initializer="he_normal")(hidden)
    # TODO: consider adding attention here.
    hidden = Dropout(0.2)(hidden)
    scores = Dense(ms_output_size)(hidden)
    y_pred = Activation(activation="softmax")(scores)

    # Extra inputs that are only consumed by the CTC cost layer.
    labels = Input(name='label_inputs',
                   shape=[label_max_string_length],
                   dtype='float32')
    audio_length = Input(name='audio_length', shape=[1], dtype='int64')
    label_length = Input(name='label_length', shape=[1], dtype='int64')

    ctc_cost = CTC_Batch_Cost()([labels, y_pred, audio_length, label_length])

    train_model = Model([audio_features, labels, audio_length, label_length],
                        [ctc_cost])
    # The model output already is the cost, so the loss just returns it.
    train_model.compile(
        optimizer="adam",
        loss={"ctc": lambda y_true, y_pred: y_pred},
    )

    base_model = Model(audio_features, y_pred)
    self.built(train_model, base_model)