Example 1
    def call(self, x, enc_output, cache, training):
        seq_len = tf.shape(x)[1]
        len_encoded = get_tensor_len(enc_output)
        encoder_padding = tf.equal(
            tf.sequence_mask(len_encoded, maxlen=tf.shape(enc_output)[1]),
            False)  # bool tensor
        padding_mask = attention_bias_ignore_padding(encoder_padding)
        look_ahead_mask = attention_bias_lower_triangle(tf.shape(x)[1])

        new_cache = []

        x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
        # x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.dec_layers[i](
                x, enc_output, training, look_ahead_mask, padding_mask)

            if cache is not None:
                # cache: (batch, decoded_len, num_layers, d_model); append the
                # states of layer i along the time axis
                decoder_output = tf.concat([cache[:, :, i, :], x], axis=1)
                new_cache.append(decoder_output[:, :, None, :])

        # x.shape == (batch_size, target_seq_len, d_model)
        if cache is not None:
            new_cache = tf.concat(new_cache, axis=2)
            return x, new_cache
        else:
            return x
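
# `get_tensor_len` is used throughout this listing but never defined here.
# A minimal sketch of what it is assumed to do, consistent with the encoders
# and decoders below that zero out padded frames: count the non-zero frames of
# a batch-major (batch, time, dim) tensor.
import tensorflow as tf

def get_tensor_len(tensor):
    # a frame is treated as padding when its feature/logit vector is all zeros
    used = tf.cast(tf.reduce_sum(tf.abs(tensor), axis=-1) > 0, tf.int32)
    return tf.reduce_sum(used, axis=-1)  # (batch,) lengths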
Example 2
def evaluate(feature, dataset, dev_size, encoder, decoder):
    num_processed = 0
    total_cer_dist = 0
    total_cer_len = 0
    total_res_len = 0
    for batch in feature:
        uttids, x = batch
        # preds = forward(x, model)
        encoded = encoder(x, training=False)
        logits = decoder(encoded, training=False)
        len_logits = get_tensor_len(logits)
        preds = ctc_decode(logits, len_logits)
        trans = dataset.get_attrs('trans', uttids.numpy())
        batch_cer_dist, batch_cer_len, batch_res_len = batch_cer(
            preds.numpy(), trans)
        total_cer_dist += batch_cer_dist
        total_cer_len += batch_cer_len
        total_res_len += batch_res_len

        num_processed += len(x)

    cer = total_cer_dist / total_cer_len
    print('dev PER: {:.3f}\t{} / {}'.format(cer, num_processed, dev_size))

    return cer
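
# `ctc_decode` is not defined in these snippets. A minimal greedy-decoding
# sketch built on tf.nn.ctc_greedy_decoder (an assumption, not necessarily the
# project's decoder; note that ctc_greedy_decoder treats the last class as the
# blank symbol):
def ctc_decode(logits, len_logits):
    decoded, _ = tf.nn.ctc_greedy_decoder(
        tf.transpose(logits, [1, 0, 2]),   # the op expects time-major logits
        tf.cast(len_logits, tf.int32))
    # densify so callers can use preds.numpy(), as evaluate() above does
    return tf.cast(tf.sparse.to_dense(decoded[0]), tf.int32)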
Example 3
def Transformer(args):
    num_layers = args.model.G.num_layers
    d_model = args.model.G.d_model
    num_heads = args.model.G.num_heads
    dff = 4 * d_model
    rate = args.model.G.dropout_rate
    dim_output = args.dim_output

    input_x = Input(shape=[None, args.dim_input], name='encoder_input')
    input_decoder = Input(shape=[None], name='decoder_input')
    cache = Input(shape=[None, num_layers, d_model], name='cache')

    # create encoder and connect
    encoded = Encoder(num_layers, d_model, num_heads, dff, rate)(input_x)

    # create two decoders: one for training and one for forward
    decoder = Decoder(num_layers, d_model, num_heads, dff, dim_output, rate)
    decoded = decoder(input_decoder, encoded, cache=None)
    _decoded, cache_decoder = decoder(input_decoder, encoded, cache)

    fc = Dense(dim_output)
    logits = fc(decoded)
    _logits = fc(_decoded)

    len_seq = get_tensor_len(decoded)
    pad_mask = tf.tile(tf.expand_dims(tf.sequence_mask(len_seq, dtype=tf.float32), -1),
                       [1, 1, dim_output])
    logits *= pad_mask
    _logits *= pad_mask

    # the training model and the cached inference model share the decoder and
    # the output projection; the inference model also returns the updated cache
    model = tf.keras.Model([input_x, input_decoder], logits, name='transformer')
    model_infer = tf.keras.Model([input_x, input_decoder, cache],
                                 [_logits, cache_decoder],
                                 name='transformer_cache')

    return model, model_infer
def Res_Conv(args):
    num_hidden = args.model.G.encoder.num_hidden

    input_x = Input(shape=[None, args.dim_input], name='encoder_input')
    len_seq = get_tensor_len(input_x)
    x = Dense(num_hidden,
              use_bias=False,
              activation='linear',
              name="encoder/fc_1")(input_x)

    for i in range(3):
        inputs = x
        x = Conv1D(filters=num_hidden, kernel_size=5, padding='same')(x)
        # x = tf.keras.layers.LayerNormalization()(x)
        x = ReLU()(x)
        x = Conv1D(filters=num_hidden, kernel_size=5, padding='same')(x)
        # x = tf.keras.layers.LayerNormalization()(x)
        x = ReLU()(x)
        x = inputs + (0.3 * x)
        x = MaxPool1D(pool_size=2, padding='SAME')(x)
        len_seq = tf.cast(tf.math.ceil(tf.cast(len_seq, tf.float32) / 2),
                          tf.int32)

    encoded = x
    pad_mask = tf.tile(
        tf.expand_dims(tf.sequence_mask(len_seq, dtype=tf.float32), -1),
        [1, 1, num_hidden])
    encoded *= pad_mask

    encoder = tf.keras.Model(input_x, encoded, name='encoder')

    return encoder
Example 5
def train_G(x, _x, _y, G, D, optimizer_G, lambda_supervise, len_D):
    params_G = G.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape_G:
        tape_G.watch(params_G)

        # supervise
        _logits = G(_x, training=True)
        loss_G_supervise = ctc_loss(
            _logits, get_tensor_len(_logits), _y,
            tf.reduce_sum(tf.cast(_y > 0, tf.int32), -1))
        loss_G_supervise = tf.reduce_mean(loss_G_supervise)
        # loss_G = loss_G_supervise
        # unsupervise
        logits = G(x, training=True)
        logits_shrunk = pad_to(ctc_shrink(logits), len_D)[:, :len_D, :]
        P_G = tf.nn.softmax(logits_shrunk)
        disc_fake = D(P_G, training=False)

        loss_G = lambda_supervise * loss_G_supervise - tf.reduce_mean(
            disc_fake)

    gradients_G = tape_G.gradient(loss_G, params_G)
    optimizer_G.apply_gradients(zip(gradients_G, params_G))

    return loss_G, loss_G_supervise
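
# `ctc_shrink` and `pad_to` are project helpers that are not shown. From the
# call above, ctc_shrink presumably collapses repeated frames and removes
# blank frames from the CTC output, and pad_to pads the time axis so the
# discriminator D always sees len_D frames. A sketch of pad_to under that
# assumption (name and behaviour inferred, not confirmed):
def pad_to(tensor, length):
    pad = tf.maximum(0, length - tf.shape(tensor)[1])
    return tf.pad(tensor, [[0, 0], [0, pad], [0, 0]])  # zero-pad the time axis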
def Conv_LSTM(args):
    num_hidden = args.model.G.encoder.num_hidden
    num_filters = args.model.G.encoder.num_filters
    size_feat = args.dim_input

    input_x = Input(shape=[None, args.dim_input], name='encoder_input')
    size_length = tf.shape(input_x)[1]
    size_feat = int(size_feat / 3)
    len_feats = get_tensor_len(input_x)
    x = tf.reshape(input_x, [-1, size_length, size_feat, 3])
    # the first cnn layer
    x = normal_conv(x=x,
                    filter_num=num_filters,
                    kernel=(3, 3),
                    stride=(2, 2),
                    padding='SAME')
    # x = normal_conv(
    #     x=x,
    #     filter_num=num_filters,
    #     kernel=(3,3),
    #     stride=(1,1),
    #     padding='SAME')
    # LSTM-style gating applied once to the convolution output: one convolution
    # produces the forget (g[0]), input (g[1]), output (g[2]) and candidate
    # (g[3]) activations, which are normalized and split below
    gates = Conv2D(4 * num_filters, (3, 3),
                   padding="SAME",
                   dilation_rate=(1, 1))(x)
    g = tf.split(LayerNormalization()(gates), 4, axis=3)
    new_cell = tf.math.sigmoid(g[0]) * x + tf.math.sigmoid(
        g[1]) * tf.math.tanh(g[3])
    x = tf.math.sigmoid(g[2]) * tf.math.tanh(new_cell)

    size_feat = int(np.ceil(size_feat / 2)) * num_filters
    size_length = tf.cast(tf.math.ceil(tf.cast(size_length, tf.float32) / 2),
                          tf.int32)
    len_seq = tf.cast(tf.math.ceil(tf.cast(len_feats, tf.float32) / 2),
                      tf.int32)
    x = tf.reshape(x, [-1, size_length, size_feat])

    x = Bidirectional(LSTM(num_hidden // 2, return_sequences=True))(x)
    x, len_seq = pooling(x, len_seq, num_hidden, 'HALF')

    x = Bidirectional(LSTM(num_hidden // 2, return_sequences=True))(x)
    x, len_seq = pooling(x, len_seq, num_hidden, 'SAME')

    x = Bidirectional(LSTM(num_hidden // 2, return_sequences=True))(x)
    x, len_seq = pooling(x, len_seq, num_hidden, 'HALF')

    x = Bidirectional(LSTM(num_hidden // 2, return_sequences=True))(x)
    x, len_seq = pooling(x, len_seq, num_hidden, 'SAME')

    encoded = x
    pad_mask = tf.tile(
        tf.expand_dims(tf.sequence_mask(len_seq, dtype=tf.float32), -1),
        [1, 1, num_hidden])
    encoded *= pad_mask

    encoder = tf.keras.Model(input_x, encoded, name='encoder')

    return encoder
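
# `pooling` is another undefined helper. The calls above suggest it returns
# (features, updated lengths), halving the time axis in 'HALF' mode and
# leaving it unchanged in 'SAME' mode. A minimal sketch under that assumption
# (the real helper may also project or concatenate adjacent frames):
def pooling(x, len_seq, num_hidden, stride_mode):
    # num_hidden is kept only for signature compatibility in this sketch
    if stride_mode == 'HALF':
        x = tf.keras.layers.MaxPool1D(pool_size=2, padding='same')(x)
        len_seq = tf.cast(tf.math.ceil(tf.cast(len_seq, tf.float32) / 2),
                          tf.int32)
    return x, len_seq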
Example 7
def monitor(sample, encoder, decoder):
    x = np.array([sample['feature']], dtype=np.float32)
    encoded = encoder(x)
    logits = decoder(encoded)
    len_logits = get_tensor_len(logits)
    preds = ctc_decode(logits, len_logits)

    print('predicts: \n', preds.numpy()[0])
    print('align: \n', sample['align'])
    print('trans: \n', sample['trans'])
def CE_loss(logits, labels):

    _loss = tf.losses.sparse_categorical_crossentropy(y_true=labels,
                                                      y_pred=logits,
                                                      from_logits=True)
    len_logits = get_tensor_len(logits)
    gen_loss = tf.sequence_mask(len_logits, dtype=tf.float32) * _loss
    # loss = tf.reduce_mean(tf.reduce_sum(gen_loss, -1) / tf.cast(len_logits, tf.float32))
    loss = tf.reduce_sum(gen_loss) / tf.cast(tf.reduce_sum(len_logits),
                                             tf.float32)

    return loss
Example 9
def train_CTC_G(x, trans, G, D, optimizer_G):
    params_G = G.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape_G:
        tape_G.watch(params_G)

        logits = G(x, training=True)
        loss_G_ctc = ctc_loss(logits, get_tensor_len(logits), trans,
                              tf.reduce_sum(tf.cast(trans > 0, tf.int32), -1))
        loss_G_ctc = tf.reduce_mean(loss_G_ctc)

    gradients_G = tape_G.gradient(loss_G_ctc, params_G)
    optimizer_G.apply_gradients(zip(gradients_G, params_G))

    return loss_G_ctc
Example 10
def train_CTC_supervised(x, labels, encoder, decoder, optimizer):
    vars = encoder.trainable_variables + decoder.trainable_variables
    with tf.GradientTape() as tape:
        encoded = encoder(x, training=True)
        logits = decoder(encoded, training=True)
        len_logits = get_tensor_len(logits)
        len_labels = tf.reduce_sum(tf.cast(labels > 0, tf.int32), -1)
        loss = ctc_loss(logits, len_logits, labels, len_labels)
        loss = tf.reduce_mean(loss)

    gradients = tape.gradient(loss, vars)
    optimizer.apply_gradients(zip(gradients, vars))

    return loss
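
# `ctc_loss` is not shown either. A sketch assuming it wraps tf.nn.ctc_loss
# with batch-major logits and the blank placed on the last class (the same
# convention as the ctc_decode sketch above; the project's actual blank index
# is not known from these examples):
def ctc_loss(logits, len_logits, labels, len_labels):
    return tf.nn.ctc_loss(
        labels=tf.cast(labels, tf.int32),
        logits=logits,
        label_length=tf.cast(len_labels, tf.int32),
        logit_length=tf.cast(len_logits, tf.int32),
        logits_time_major=False,
        blank_index=-1)  # -1 selects the last class as the blank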
def Fully_Connected(args):
    dim_output = args.dim_output
    dim_input = args.model.G.encoder.num_hidden

    encoded = Input(shape=[None, dim_input], name='encoded')
    len_seq = get_tensor_len(encoded)

    logits = Dense(dim_output, name="decoder/fc")(encoded)
    pad_mask = tf.tile(
        tf.expand_dims(tf.sequence_mask(len_seq, dtype=tf.float32), -1),
        [1, 1, dim_output])
    logits *= pad_mask

    decoder = tf.keras.Model(encoded, logits, name='decoder')

    return decoder
Example 12
    def call(self, dec_input, enc_output, training):
        len_encoded = get_tensor_len(enc_output)
        encoder_padding = tf.equal(
            tf.sequence_mask(len_encoded, maxlen=tf.shape(enc_output)[1]),
            False)  # bool tensor
        padding_mask = attention_bias_ignore_padding(encoder_padding)
        look_ahead_mask = attention_bias_lower_triangle(tf.shape(dec_input)[1])

        x = self.embedding(dec_input)
        x = add_timing_signal_1d(x)
        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.dec_layers[i](inputs=x,
                                   enc_output=enc_output,
                                   training=training,
                                   look_ahead_mask=look_ahead_mask,
                                   padding_mask=padding_mask)

        return x
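
# The attention-bias helpers used by the decoders and encoders follow the
# tensor2tensor convention of adding a large negative value at positions that
# must not be attended to. A sketch assuming that convention:
def attention_bias_ignore_padding(memory_padding):
    # memory_padding: bool/float (batch, memory_len); True/1 marks padding
    bias = tf.cast(memory_padding, tf.float32) * -1e9
    return tf.expand_dims(tf.expand_dims(bias, 1), 1)  # (batch, 1, 1, memory_len)

def attention_bias_lower_triangle(length):
    # forbid attending to future positions during decoding
    band = tf.linalg.band_part(tf.ones([length, length]), -1, 0)
    return -1e9 * (1.0 - band)[tf.newaxis, tf.newaxis, :, :]  # (1, 1, len, len)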
Example 13
    def call(self, x, training):

        seq_len = tf.shape(x)[1]
        len_x = get_tensor_len(x)

        x = self.fc(x)
        x = self.layernorm(x)
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        encoder_padding = tf.equal(tf.sequence_mask(len_x, maxlen=seq_len), False) # bool tensor
        mask = attention_bias_ignore_padding(encoder_padding)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        x *= tf.expand_dims(1.0 - tf.cast(encoder_padding, tf.float32), axis=-1)

        return x  # (batch_size, input_seq_len, d_model)
def RNN_FC(args):
    dim_input = args.model.G.encoder.num_hidden
    dim_output = args.dim_output
    num_hidden = args.model.G.decoder.num_hidden
    cell_type = args.model.G.decoder.cell_type
    dropout = args.model.G.decoder.dropout

    encoded = Input(shape=[None, dim_input], name='encoded')
    len_seq = get_tensor_len(encoded)

    if cell_type == 'gru':
        x = GRU(num_hidden,
                return_sequences=True,
                dropout=dropout,
                name="decoder/gru")(encoded)
    elif cell_type == 'lstm':
        x = LSTM(num_hidden,
                 return_sequences=True,
                 dropout=dropout,
                 name="decoder/lstm")(encoded)
    elif cell_type == 'bgru':
        x = Bidirectional(
            GRU(int(num_hidden // 2),
                return_sequences=True,
                dropout=dropout,
                name="decoder/gru"))(encoded)
    elif cell_type == 'blstm':
        x = Bidirectional(
            LSTM(int(num_hidden // 2),
                 return_sequences=True,
                 dropout=dropout,
                 name="decoder/lstm"))(encoded)
    else:
        raise ValueError('unsupported cell_type: {}'.format(cell_type))
    logits = Dense(dim_output, name="decoder/fc")(x)
    pad_mask = tf.tile(
        tf.expand_dims(tf.sequence_mask(len_seq, dtype=tf.float32), -1),
        [1, 1, dim_output])
    logits *= pad_mask

    decoder = tf.keras.Model(encoded, logits, name='decoder')

    return decoder
Example 15
def CE_loss(logits, labels, vocab_size, confidence=0.9):

    low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
    # entropy of the smoothed target distribution; subtracted below so that a
    # perfect prediction yields zero loss despite label smoothing
    normalizing = -(confidence * tf.math.log(confidence) +
                    tf.cast(vocab_size - 1, tf.float32) * low_confidence *
                    tf.math.log(low_confidence + 1e-20))
    soft_targets = tf.one_hot(tf.cast(labels, tf.int32),
                              depth=vocab_size,
                              on_value=confidence,
                              off_value=low_confidence)

    xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=soft_targets)
    loss = xentropy - normalizing

    len_logits = get_tensor_len(logits)
    gen_loss = tf.sequence_mask(len_logits, dtype=tf.float32) * loss
    loss = tf.reduce_mean(
        tf.reduce_sum(gen_loss, -1) / tf.cast(len_logits, tf.float32))

    return loss
Example 16
    def call(self, x, training):

        len_x = get_tensor_len(x)

        x = self.fc(x)
        x = self.layernorm(x)
        x = add_timing_signal_1d(x)

        x = self.dropout(x, training=training)

        encoder_padding = tf.equal(
            tf.sequence_mask(len_x, maxlen=tf.shape(x)[1]),
            False)  # bool tensor
        mask = attention_bias_ignore_padding(encoder_padding)

        for i in range(self.num_layers):
            x = self.enc_layers[i](inputs=x, training=training, mask=mask)

        x *= tf.expand_dims(1.0 - tf.cast(encoder_padding, tf.float32),
                            axis=-1)

        return x  # (batch_size, input_seq_len, d_model)
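
# `add_timing_signal_1d` (used in Examples 12 and 16) is assumed to be the
# tensor2tensor-style sinusoidal position signal added to the inputs; a sketch:
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    length = tf.shape(x)[1]
    channels = tf.shape(x)[2]
    position = tf.cast(tf.range(length), tf.float32)
    num_timescales = channels // 2
    log_timescale_increment = (
        tf.math.log(max_timescale / min_timescale) /
        tf.maximum(tf.cast(num_timescales, tf.float32) - 1.0, 1.0))
    inv_timescales = tf.exp(
        tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
    scaled_time = position[:, tf.newaxis] * inv_timescales[tf.newaxis, :]
    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
    signal = tf.pad(signal, [[0, 0], [0, tf.math.floormod(channels, 2)]])
    return x + tf.reshape(signal, [1, length, channels])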