def call(self, x, enc_output, cache, training):
    seq_len = tf.shape(x)[1]
    len_encoded = get_tensor_len(enc_output)
    # bool tensor: True at padded encoder positions
    encoder_padding = tf.equal(
        tf.sequence_mask(len_encoded, maxlen=tf.shape(enc_output)[1]), False)
    padding_mask = attention_bias_ignore_padding(encoder_padding)
    look_ahead_mask = attention_bias_lower_triangle(seq_len)
    new_cache = []

    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    # x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
        x = self.dec_layers[i](
            x, enc_output, training, look_ahead_mask, padding_mask)
        if cache is not None:
            decoder_output = tf.concat([cache[:, :, i, :], x], axis=1)
            new_cache.append(decoder_output[:, :, None, :])

    # x.shape == (batch_size, target_seq_len, d_model)
    if cache is not None:
        new_cache = tf.concat(new_cache, axis=2)
        return x, new_cache
    else:
        return x
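# A note on the cache layout above (descriptive, not original code):
# cache[b, t, i, :] holds layer i's output at decoded position t, so the
# returned new_cache has shape (batch, cache_len + seq_len, num_layers,
# d_model) and can be fed back as `cache` on the next call.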
def evaluate(feature, dataset, dev_size, encoder, decoder):
    num_processed = 0
    total_cer_dist = 0
    total_cer_len = 0
    total_res_len = 0
    for batch in feature:
        uttids, x = batch
        encoded = encoder(x, training=False)
        logits = decoder(encoded, training=False)
        len_logits = get_tensor_len(logits)
        preds = ctc_decode(logits, len_logits)
        trans = dataset.get_attrs('trans', uttids.numpy())
        batch_cer_dist, batch_cer_len, batch_res_len = batch_cer(
            preds.numpy(), trans)
        total_cer_dist += batch_cer_dist
        total_cer_len += batch_cer_len
        total_res_len += batch_res_len
        num_processed += len(x)

    cer = total_cer_dist / total_cer_len
    print('dev PER: {:.3f}\t{} / {}'.format(cer, num_processed, dev_size))

    return cer
def Transformer(args):
    num_layers = args.model.G.num_layers
    d_model = args.model.G.d_model
    num_heads = args.model.G.num_heads
    dff = 4 * d_model
    rate = args.model.G.dropout_rate
    dim_output = args.dim_output

    input_x = Input(shape=[None, args.dim_input], name='encoder_input')
    input_decoder = Input(shape=[None], name='decoder_input')
    cache = Input(shape=[None, num_layers, d_model], name='cache')

    # create encoder and connect
    encoded = Encoder(num_layers, d_model, num_heads, dff, rate)(input_x)

    # one decoder, connected twice: once without cache for training and
    # once with cache for incremental inference
    decoder = Decoder(num_layers, d_model, num_heads, dff, dim_output, rate)
    decoded = decoder(input_decoder, encoded, cache=None)
    _decoded, cache_decoder = decoder(input_decoder, encoded, cache)

    fc = Dense(dim_output)
    logits = fc(decoded)
    _logits = fc(_decoded)

    len_seq = get_tensor_len(decoded)
    pad_mask = tf.tile(
        tf.expand_dims(tf.sequence_mask(len_seq, dtype=tf.float32), -1),
        [1, 1, dim_output])
    logits *= pad_mask
    _logits *= pad_mask

    model = tf.keras.Model([input_x, input_decoder], logits,
                           name='transformer')
    # output the updated cache (cache_decoder), not the cache input itself
    model_infer = tf.keras.Model([input_x, input_decoder, cache],
                                 [_logits, cache_decoder],
                                 name='transformer_cache')

    return model, model_infer
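# A minimal usage sketch for the two graphs returned above; `features` and
# `decoder_inputs` are illustrative names, not part of the original code.
def run_transformer_example(args, features, decoder_inputs):
    model, model_infer = Transformer(args)
    # training graph: teacher-forced pass, no cache
    logits = model([features, decoder_inputs], training=True)
    # inference graph: start from an empty cache and thread the returned
    # cache through successive calls
    batch = tf.shape(features)[0]
    cache = tf.zeros(
        [batch, 0, args.model.G.num_layers, args.model.G.d_model])
    _logits, cache = model_infer([features, decoder_inputs, cache],
                                 training=False)
    return logits, _logits, cache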
def Res_Conv(args):
    num_hidden = args.model.G.encoder.num_hidden

    input_x = Input(shape=[None, args.dim_input], name='encoder_input')
    len_seq = get_tensor_len(input_x)

    x = Dense(num_hidden, use_bias=False, activation='linear',
              name="encoder/fc_1")(input_x)
    for i in range(3):
        inputs = x
        # 'same' padding keeps the time axis intact for the residual add
        x = Conv1D(filters=num_hidden, kernel_size=5, padding='same')(x)
        # x = tf.keras.layers.LayerNormalization()(x)
        x = ReLU()(x)
        x = Conv1D(filters=num_hidden, kernel_size=5, padding='same')(x)
        # x = tf.keras.layers.LayerNormalization()(x)
        x = ReLU()(x)
        x = inputs + 0.3 * x
        x = MaxPool1D(pool_size=2, padding='same')(x)
        len_seq = tf.cast(
            tf.math.ceil(tf.cast(len_seq, tf.float32) / 2), tf.int32)

    encoded = x
    pad_mask = tf.tile(
        tf.expand_dims(tf.sequence_mask(len_seq, dtype=tf.float32), -1),
        [1, 1, num_hidden])
    encoded *= pad_mask

    encoder = tf.keras.Model(input_x, encoded, name='encoder')

    return encoder
def train_G(x, _x, _y, G, D, optimizer_G, lambda_supervise, len_D):
    params_G = G.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape_G:
        tape_G.watch(params_G)

        # supervised CTC loss on the labeled batch (_x, _y)
        _logits = G(_x, training=True)
        loss_G_supervise = ctc_loss(
            _logits, get_tensor_len(_logits),
            _y, tf.reduce_sum(tf.cast(_y > 0, tf.int32), -1))
        loss_G_supervise = tf.reduce_mean(loss_G_supervise)

        # unsupervised adversarial loss on the unlabeled batch x
        logits = G(x, training=True)
        logits_shrunk = pad_to(ctc_shrink(logits), len_D)[:, :len_D, :]
        P_G = tf.nn.softmax(logits_shrunk)
        disc_fake = D(P_G, training=False)

        loss_G = (lambda_supervise * loss_G_supervise -
                  tf.reduce_mean(disc_fake))

    gradients_G = tape_G.gradient(loss_G, params_G)
    optimizer_G.apply_gradients(zip(gradients_G, params_G))

    return loss_G, loss_G_supervise
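# A sketch of the matching critic update, under the WGAN-style reading of
# the generator loss above (-mean(disc_fake)). This helper is an assumption,
# not part of the original code: `real_probs` stands for a
# (batch, len_D, dim_output) tensor of one-hot distributions over real token
# sequences, and any gradient-penalty term is omitted.
def train_D(x, real_probs, G, D, optimizer_D, len_D):
    params_D = D.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape_D:
        tape_D.watch(params_D)
        logits = G(x, training=False)
        logits_shrunk = pad_to(ctc_shrink(logits), len_D)[:, :len_D, :]
        P_G = tf.nn.softmax(logits_shrunk)
        disc_fake = D(P_G, training=True)
        disc_real = D(real_probs, training=True)
        # critic objective: raise scores on real, lower them on generated
        loss_D = tf.reduce_mean(disc_fake) - tf.reduce_mean(disc_real)
    gradients_D = tape_D.gradient(loss_D, params_D)
    optimizer_D.apply_gradients(zip(gradients_D, params_D))
    return loss_D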
def Conv_LSTM(args):
    num_hidden = args.model.G.encoder.num_hidden
    num_filters = args.model.G.encoder.num_filters
    size_feat = args.dim_input

    input_x = Input(shape=[None, args.dim_input], name='encoder_input')
    size_length = tf.shape(input_x)[1]
    size_feat = int(size_feat / 3)
    len_feats = get_tensor_len(input_x)
    # split the stacked features into 3 input channels
    x = tf.reshape(input_x, [-1, size_length, size_feat, 3])

    # the first cnn layer, subsampling time and frequency by 2
    x = normal_conv(
        x=x,
        filter_num=num_filters,
        kernel=(3, 3),
        stride=(2, 2),
        padding='SAME')
    # x = normal_conv(
    #     x=x,
    #     filter_num=num_filters,
    #     kernel=(3, 3),
    #     stride=(1, 1),
    #     padding='SAME')

    # convolutional LSTM-style gating
    gates = Conv2D(4 * num_filters, (3, 3), padding="SAME",
                   dilation_rate=(1, 1))(x)
    g = tf.split(LayerNormalization()(gates), 4, axis=3)
    new_cell = (tf.math.sigmoid(g[0]) * x +
                tf.math.sigmoid(g[1]) * tf.math.tanh(g[3]))
    x = tf.math.sigmoid(g[2]) * tf.math.tanh(new_cell)

    # account for the stride-2 subsampling of the first conv layer
    size_feat = int(np.ceil(size_feat / 2)) * num_filters
    size_length = tf.cast(
        tf.math.ceil(tf.cast(size_length, tf.float32) / 2), tf.int32)
    len_seq = tf.cast(
        tf.math.ceil(tf.cast(len_feats, tf.float32) / 2), tf.int32)
    x = tf.reshape(x, [-1, size_length, size_feat])

    x = Bidirectional(LSTM(num_hidden // 2, return_sequences=True))(x)
    x, len_seq = pooling(x, len_seq, num_hidden, 'HALF')
    x = Bidirectional(LSTM(num_hidden // 2, return_sequences=True))(x)
    x, len_seq = pooling(x, len_seq, num_hidden, 'SAME')
    x = Bidirectional(LSTM(num_hidden // 2, return_sequences=True))(x)
    x, len_seq = pooling(x, len_seq, num_hidden, 'HALF')
    x = Bidirectional(LSTM(num_hidden // 2, return_sequences=True))(x)
    x, len_seq = pooling(x, len_seq, num_hidden, 'SAME')
    encoded = x

    pad_mask = tf.tile(
        tf.expand_dims(tf.sequence_mask(len_seq, dtype=tf.float32), -1),
        [1, 1, num_hidden])
    encoded *= pad_mask

    encoder = tf.keras.Model(input_x, encoded, name='encoder')

    return encoder
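# The gating above is an LSTM-style cell update computed convolutionally,
# with the four gate maps g[0..3] split from one 4*num_filters convolution:
#   new_cell = sigmoid(g0) * x + sigmoid(g1) * tanh(g3)   # forget + input
#   x        = sigmoid(g2) * tanh(new_cell)               # output gate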
def monitor(sample, encoder, decoder):
    x = np.array([sample['feature']], dtype=np.float32)
    encoded = encoder(x, training=False)
    logits = decoder(encoded, training=False)
    len_logits = get_tensor_len(logits)
    preds = ctc_decode(logits, len_logits)
    print('predicts: \n', preds.numpy()[0])
    print('align: \n', sample['align'])
    print('trans: \n', sample['trans'])
def CE_loss(logits, labels):
    _loss = tf.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)
    len_logits = get_tensor_len(logits)
    gen_loss = tf.sequence_mask(len_logits, dtype=tf.float32) * _loss
    # per-utterance mean instead of the token-level mean below:
    # loss = tf.reduce_mean(
    #     tf.reduce_sum(gen_loss, -1) / tf.cast(len_logits, tf.float32))
    loss = (tf.reduce_sum(gen_loss) /
            tf.cast(tf.reduce_sum(len_logits), tf.float32))

    return loss
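# Example of the two averaging choices above: with per-frame losses
# [[1, 1, 0], [2, 0, 0]] and lengths [2, 1],
#   token-level mean   = (1 + 1 + 2) / (2 + 1)   = 4/3
#   per-utterance mean = ((1 + 1)/2 + 2/1) / 2   = 3/2
# The implementation uses the token-level mean; the commented line is the
# per-utterance variant.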
def train_CTC_G(x, trans, G, D, optimizer_G):
    # D is unused in this supervised step
    params_G = G.trainable_variables
    with tf.GradientTape(watch_accessed_variables=False) as tape_G:
        tape_G.watch(params_G)
        logits = G(x, training=True)
        loss_G_ctc = ctc_loss(
            logits, get_tensor_len(logits),
            trans, tf.reduce_sum(tf.cast(trans > 0, tf.int32), -1))
        loss_G_ctc = tf.reduce_mean(loss_G_ctc)

    gradients_G = tape_G.gradient(loss_G_ctc, params_G)
    optimizer_G.apply_gradients(zip(gradients_G, params_G))

    return loss_G_ctc
def train_CTC_supervised(x, labels, encoder, decoder, optimizer):
    params = encoder.trainable_variables + decoder.trainable_variables
    with tf.GradientTape() as tape:
        encoded = encoder(x, training=True)
        logits = decoder(encoded, training=True)
        len_logits = get_tensor_len(logits)
        len_labels = tf.reduce_sum(tf.cast(labels > 0, tf.int32), -1)
        loss = ctc_loss(logits, len_logits, labels, len_labels)
        loss = tf.reduce_mean(loss)

    gradients = tape.gradient(loss, params)
    optimizer.apply_gradients(zip(gradients, params))

    return loss
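# A minimal training-loop sketch around train_CTC_supervised(). The dataset
# shape and the 0-as-padding label convention are assumptions for
# illustration, not part of the original code.
def fit_ctc(dataset, encoder, decoder, optimizer, num_steps=1000):
    for step, (x, labels) in enumerate(dataset):
        loss = train_CTC_supervised(x, labels, encoder, decoder, optimizer)
        if step % 100 == 0:
            print('step {}: ctc loss {:.3f}'.format(step, loss.numpy()))
        if step >= num_steps:
            break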
def Fully_Connected(args):
    dim_output = args.dim_output
    dim_input = args.model.G.encoder.num_hidden

    encoded = Input(shape=[None, dim_input], name='encoded')
    len_seq = get_tensor_len(encoded)

    logits = Dense(dim_output, name="decoder/fc")(encoded)
    pad_mask = tf.tile(
        tf.expand_dims(tf.sequence_mask(len_seq, dtype=tf.float32), -1),
        [1, 1, dim_output])
    logits *= pad_mask

    decoder = tf.keras.Model(encoded, logits, name='decoder')

    return decoder
def call(self, dec_input, enc_output, training):
    len_encoded = get_tensor_len(enc_output)
    # bool tensor: True at padded encoder positions; maxlen pins the mask
    # to the encoder output's time dimension
    encoder_padding = tf.equal(
        tf.sequence_mask(len_encoded, maxlen=tf.shape(enc_output)[1]), False)
    padding_mask = attention_bias_ignore_padding(encoder_padding)
    look_ahead_mask = attention_bias_lower_triangle(tf.shape(dec_input)[1])

    x = self.embedding(dec_input)
    x = add_timing_signal_1d(x)
    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
        x = self.dec_layers[i](inputs=x,
                               enc_output=enc_output,
                               training=training,
                               look_ahead_mask=look_ahead_mask,
                               padding_mask=padding_mask)

    return x
def call(self, x, training):
    seq_len = tf.shape(x)[1]
    len_x = get_tensor_len(x)

    x = self.fc(x)
    x = self.layernorm(x)
    x += self.pos_encoding[:, :seq_len, :]
    x = self.dropout(x, training=training)

    # bool tensor: True at padded input positions
    encoder_padding = tf.equal(
        tf.sequence_mask(len_x, maxlen=seq_len), False)
    mask = attention_bias_ignore_padding(encoder_padding)

    for i in range(self.num_layers):
        x = self.enc_layers[i](x, training, mask)

    # zero out padded positions in the output
    x *= tf.expand_dims(1.0 - tf.cast(encoder_padding, tf.float32), axis=-1)

    return x  # (batch_size, input_seq_len, d_model)
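# A note on the final masking above: get_tensor_len() is later applied to
# this encoder output (e.g. in the decoders), which suggests it infers
# lengths from all-zero frames; zeroing the padded positions keeps the
# recovered lengths consistent with len_x.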
def RNN_FC(args):
    dim_input = args.model.G.encoder.num_hidden
    dim_output = args.dim_output
    num_hidden = args.model.G.decoder.num_hidden
    cell_type = args.model.G.decoder.cell_type
    dropout = args.model.G.decoder.dropout

    encoded = Input(shape=[None, dim_input], name='encoded')
    len_seq = get_tensor_len(encoded)

    if cell_type == 'gru':
        x = GRU(num_hidden,
                return_sequences=True,
                dropout=dropout,
                name="decoder/gru")(encoded)
    elif cell_type == 'lstm':
        x = LSTM(num_hidden,
                 return_sequences=True,
                 dropout=dropout,
                 name="decoder/lstm")(encoded)
    elif cell_type == 'bgru':
        x = Bidirectional(
            GRU(num_hidden // 2,
                return_sequences=True,
                dropout=dropout,
                name="decoder/gru"))(encoded)
    elif cell_type == 'blstm':
        x = Bidirectional(
            LSTM(num_hidden // 2,
                 return_sequences=True,
                 dropout=dropout,
                 name="decoder/lstm"))(encoded)
    else:
        raise ValueError('unknown cell_type: {}'.format(cell_type))

    logits = Dense(dim_output, name="decoder/fc")(x)
    pad_mask = tf.tile(
        tf.expand_dims(tf.sequence_mask(len_seq, dtype=tf.float32), -1),
        [1, 1, dim_output])
    logits *= pad_mask

    decoder = tf.keras.Model(encoded, logits, name='decoder')

    return decoder
def CE_loss(logits, labels, vocab_size, confidence=0.9):
    low_confidence = (1.0 - confidence) / tf.cast(vocab_size - 1, tf.float32)
    normalizing = -(
        confidence * tf.math.log(confidence) +
        tf.cast(vocab_size - 1, tf.float32) * low_confidence *
        tf.math.log(low_confidence + 1e-20))
    soft_targets = tf.one_hot(
        tf.cast(labels, tf.int32),
        depth=vocab_size,
        on_value=confidence,
        off_value=low_confidence)
    xentropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=soft_targets)
    loss = xentropy - normalizing

    len_logits = get_tensor_len(logits)
    gen_loss = tf.sequence_mask(len_logits, dtype=tf.float32) * loss
    loss = tf.reduce_mean(
        tf.reduce_sum(gen_loss, -1) / tf.cast(len_logits, tf.float32))

    return loss
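# With confidence c over V classes, the smoothed target puts c on the true
# class and (1 - c) / (V - 1) on every other class. The `normalizing` term
# above is the entropy of that target distribution, i.e. the minimum
# achievable cross-entropy, so subtracting it shifts the loss to 0 at the
# optimum:
#   H(q) = -(c * log(c) + (V - 1) * ((1 - c)/(V - 1)) * log((1 - c)/(V - 1)))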
def call(self, x, training):
    len_x = get_tensor_len(x)

    x = self.fc(x)
    x = self.layernorm(x)
    x = add_timing_signal_1d(x)
    x = self.dropout(x, training=training)

    # bool tensor: True at padded input positions; maxlen pins the mask to
    # the input's time dimension
    encoder_padding = tf.equal(
        tf.sequence_mask(len_x, maxlen=tf.shape(x)[1]), False)
    mask = attention_bias_ignore_padding(encoder_padding)

    for i in range(self.num_layers):
        x = self.enc_layers[i](inputs=x, training=training, mask=mask)

    # zero out padded positions in the output
    x *= tf.expand_dims(1.0 - tf.cast(encoder_padding, tf.float32), axis=-1)

    return x  # (batch_size, input_seq_len, d_model)