Ejemplo n.º 1
0
    def compile(self,
                feature_shape=(None, 200),
                label_max_string_length=64,
                ms_output_size=1438,
                h_dim=128):
        """Build and compile a WaveNet-style CTC acoustic model.

        Stacks dilated residual blocks over the MFCC input, sums their
        skip connections, projects to the output vocabulary with a
        softmax, and attaches a CTC loss head.  The resulting training
        and inference models are handed to ``self.built``.
        """
        mfcc_in = Input(shape=feature_shape, name="mfcc_input")

        net = self.wave_conv1d_block(mfcc_in, filters=h_dim)
        # Three cycles over dilation rates 1..16; skip outputs are
        # accumulated by element-wise addition.
        skip_sum = None
        for rate in [1, 2, 4, 8, 16] * 3:
            net, skip_out = self.wave_residual_block(net,
                                                     kernal_size=7,
                                                     rate=rate,
                                                     filters=h_dim)
            skip_sum = skip_out if skip_sum is None else Add()([skip_sum, skip_out])

        head = self.wave_conv1d_block(skip_sum, filters=64)
        y_pred = self.wave_conv1d_block(
            head,
            filters=ms_output_size,
            bias=True,
            activation="softmax",
        )

        # Auxiliary inputs consumed only by the CTC loss computation.
        label_ipt = Input(name='label_inputs',
                          shape=[label_max_string_length],
                          dtype='float32')
        audio_length = Input(name='audio_length', shape=[1], dtype='int64')
        label_length = Input(name='label_length', shape=[1], dtype='int64')

        loss_out = CTC_Batch_Cost()(
            [label_ipt, y_pred, audio_length, label_length])

        train_model = Model([mfcc_in, label_ipt, audio_length, label_length],
                            [loss_out])
        # The CTC layer already emits the loss value, so the Keras loss
        # function simply passes it through.
        train_model.compile(optimizer="adam",
                            loss={
                                "ctc": lambda y_true, y_pred: y_pred
                            })

        base_model = Model(mfcc_in, y_pred)

        self.built(train_model, base_model)
Ejemplo n.º 2
0
    def compile(self,
                feature_shape=(1024, 200),
                label_max_string_length=32,
                ms_output_size=1423):
        """Build and compile a CNN + Dense CTC acoustic model.

        Three pooled 1-D conv cells downsample the audio features, a
        residual conv branch refines the result, and a two-layer dense
        head projects to the output vocabulary with a softmax.  A CTC
        loss head is attached and both models are handed to
        ``self.built``.
        """
        audio_ipt = Input(name="audio_input", shape=feature_shape)

        # Downsampling trunk: three pooled conv cells (32, 64, 64 filters).
        trunk = self.cnn1d_cell(32, audio_ipt, pool=True)
        trunk = self.cnn1d_cell(64, trunk, pool=True)
        trunk = self.cnn1d_cell(64, trunk, pool=True)

        # Residual refinement branch added back onto the trunk.
        branch = self.conv1d_layers(trunk, 64, 8)
        branch = self.cnn1d_cell(64, branch, pool=False)
        residual = Add()([trunk, branch])

        # Dense classification head with dropout regularization.
        dense_in = Dropout(0.2)(residual)
        hidden = Dense(256,
                       activation="relu",
                       kernel_initializer="he_normal")(
                           dense_in)  # TODO: consider adding attention here
        hidden = Dropout(0.2)(hidden)
        logits = Dense(ms_output_size)(hidden)
        y_pred = Activation(activation="softmax")(logits)

        # Auxiliary inputs consumed only by the CTC loss computation.
        y_true = Input(name='label_inputs',
                       shape=[label_max_string_length],
                       dtype='float32')
        audio_length = Input(name='audio_length', shape=[1], dtype='int64')
        label_length = Input(name='label_length', shape=[1], dtype='int64')

        loss_out = CTC_Batch_Cost()(
            [y_true, y_pred, audio_length, label_length])
        train_model = Model([audio_ipt, y_true, audio_length, label_length],
                            [loss_out])
        # The CTC layer already emits the loss value, so the Keras loss
        # function simply passes it through.
        train_model.compile(optimizer="adam",
                            loss={
                                "ctc": lambda y_true, y_pred: y_pred
                            })

        base_model = Model(audio_ipt, y_pred)

        self.built(train_model, base_model)