def __init__(self, config, emb_layer, vocab_size, **kwargs):
    """Build the decoder stack from the model configuration.

    Args:
        config: nested configuration mapping. Hyper-parameters are read
            from ``config['model']['net']['structure']`` and the inference
            flag from ``config['model']['is_infer']``.
        emb_layer: layer stored as ``self.embed`` (presumably the token
            embedding supplied by the caller -- confirm against caller).
        vocab_size: number of units of the final dense projection.
        **kwargs: forwarded unchanged to the base-class constructor.
    """
    # The base class must be initialised before any attribute is assigned,
    # otherwise the sub-layers created below may not be tracked correctly.
    super().__init__(**kwargs)

    model_config = config['model']['net']['structure']
    self.is_infer = config['model']['is_infer']
    if self.is_infer:
        # Only read (and therefore only required) in inference mode.
        self.length_penalty = model_config['length_penalty']
    self.dropout_rate = model_config['dropout_rate']
    self.num_layers = model_config['num_layers']
    self.l2_reg_lambda = model_config['l2_reg_lambda']
    self.embedding_size = model_config['embedding_size']
    self.max_enc_len = model_config['max_enc_len']
    self.max_dec_len = model_config['max_dec_len']
    self.share_embedding = model_config['share_embedding']
    self.padding_token = 0
    self.beam_size = model_config['beam_size']

    # Produces 1 where the input differs from the padding token, else 0.
    self.mask_layer = tf.keras.layers.Lambda(lambda inputs: tf.cast(
        tf.not_equal(inputs, self.padding_token), tf.int32))
    self.embed = emb_layer
    self.vocab_size = vocab_size
    self.embed_d = tf.keras.layers.Dropout(self.dropout_rate)
    # NOTE(review): the position embedding is sized with max_enc_len even
    # though max_dec_len is also available -- confirm this is intended.
    self.pos_embed = PositionEmbedding(self.max_enc_len, self.embedding_size)
    self.transformer_decoders = [
        TransformerDecoderLayer(config) for _ in range(self.num_layers)
    ]
    self.final_dense = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(self.vocab_size, name="final_dense"))
def __init__(self, config, **kwargs):
    """Create the encoder/decoder pair of the transformer seq2seq model.

    Args:
        config: nested configuration mapping. Hyper-parameters are read
            from ``config['model']['net']['structure']`` and the inference
            flag from ``config['model']['is_infer']``.
        **kwargs: forwarded unchanged to the base-class constructor.

    ``self.embed``, ``self.embedding_size`` and ``self.decode_vocab_size``
    are read here but not assigned in this method; they are presumably set
    by the base-class constructor -- confirm against the base class.
    """
    super().__init__(config, **kwargs)
    # Use the same logger as the rest of this method; `tf.logging` is a
    # deprecated TF1-only alias.
    logging.info("Initialize TransformerModel...")

    model_config = config['model']['net']['structure']
    self.is_infer = config['model']['is_infer']
    if self.is_infer:
        # Only read (and therefore only required) in inference mode.
        self.length_penalty = model_config['length_penalty']
    self.dropout_rate = model_config['dropout_rate']
    self.num_layers = model_config['num_layers']
    self.l2_reg_lambda = model_config['l2_reg_lambda']
    self.max_enc_len = model_config['max_enc_len']
    self.max_dec_len = model_config['max_dec_len']
    self.share_embedding = model_config['share_embedding']
    self.padding_token = utils.PAD_IDX
    self.beam_size = model_config['beam_size']

    # Produces 1 where the input differs from the padding token, else 0.
    self.mask_layer = tf.keras.layers.Lambda(lambda inputs: tf.cast(
        tf.not_equal(inputs, self.padding_token), tf.int32))
    self.embed_d = tf.keras.layers.Dropout(self.dropout_rate)
    self.pos_embed = layers.PositionEmbedding(self.max_enc_len,
                                              self.embedding_size)
    self.encoder = layers.TransformerEncoder(config)
    self.decoder = layers.TransformerDecoder(config, self.embed,
                                             self.decode_vocab_size)
    logging.info("decode_vocab_size: {}".format(self.decode_vocab_size))
    logging.info("Initialize TransformerModel done.")
def compute_sen_lens(inputs, padding_token=0):
    """Count how many non-padding tokens each sequence contains.

    Args:
        inputs: tensor of token ids with shape [..., time_steps].
        padding_token: id that marks padding positions (default 0).

    Returns:
        int32 tensor of shape [...] holding, for every sequence, the number
        of entries different from ``padding_token``. Sequences made entirely
        of padding are reported with length 1 instead of 0.
    """
    is_token = tf.cast(tf.not_equal(inputs, padding_token), tf.int32)
    sen_lens = tf.reduce_sum(is_token, axis=-1)
    # A length is a count, not a token id, so the "empty sequence" case is
    # detected against the literal 0 (via a clamp) rather than against the
    # padding id, which only coincides with 0 by convention.
    return tf.maximum(sen_lens, tf.ones_like(sen_lens))
def __init__(self, config, **kwargs):
    """Set up the RNN encoder/decoder pair and helper layers.

    Args:
        config: nested configuration mapping; ``dropout_rate`` comes from
            ``config['model']['net']['structure']`` and the inference flag
            from ``config['model']['is_infer']``.
        **kwargs: passed straight through to the base-class constructor.

    ``self.decoder_embed`` and ``self.decode_vocab_size`` are consumed here
    but not assigned in this method (presumably provided by the base
    class -- confirm against the base class).
    """
    super().__init__(config, **kwargs)
    logging.info("Initialize RnnSeq2SeqModel...")

    structure_cfg = config['model']['net']['structure']
    self.is_infer = config['model']['is_infer']
    self.dropout_rate = structure_cfg['dropout_rate']
    self.padding_token = utils.PAD_IDX

    # Layers are created in a fixed order: dropout, encoder, decoder, mask.
    self.embed_d = tf.keras.layers.Dropout(self.dropout_rate)
    self.encoder = layers.RnnEncoder(config, name="encoder")
    self.decoder = layers.RnnDecoder(
        config,
        self.decoder_embed,
        self.decode_vocab_size,
        name="decoder",
    )
    # 1 for real tokens, 0 for padding positions.
    self.mask_layer = tf.keras.layers.Lambda(
        lambda inputs: tf.cast(
            tf.not_equal(inputs, self.padding_token), tf.int32))
def transform_preprocess(labels=None, blank_index=None, num_class=None):
    """Validate ``blank_index`` and convert dense labels to a SparseTensor.

    Args:
        labels: ``None``, a ``tf.SparseTensor`` (returned untouched) or a
            dense tensor whose zero entries are dropped during conversion.
        blank_index: index of the blank label; must be >= 0.
        num_class: optional number of classes; when given, ``blank_index``
            must not exceed ``num_class - 1``.

    Returns:
        ``None`` when ``labels`` is ``None``, otherwise a ``tf.SparseTensor``
        of int32 values containing the non-zero entries of ``labels``.

    Raises:
        ValueError: if ``blank_index`` is missing, negative or too large.
    """
    blank_missing_or_negative = blank_index is None or blank_index < 0
    if blank_missing_or_negative:
        raise ValueError('blank_index must be greater than or equal to zero')

    if num_class is not None and blank_index > (num_class - 1):
        raise ValueError('blank_index must be less than or equal to num_class - 1')

    if labels is None:
        return None

    if isinstance(labels, tf.SparseTensor):
        # Already sparse: nothing to convert.
        return labels

    dense_labels = tf.cast(labels, tf.int32)
    # Coordinates of every non-zero label (zeros are dropped).
    nonzero_coords = tf.where(tf.not_equal(dense_labels, 0))
    return tf.SparseTensor(
        indices=nonzero_coords,
        values=tf.gather_nd(dense_labels, nonzero_coords),
        dense_shape=tf.cast(tf.shape(dense_labels), dtype=tf.int64))
def __init__(self, config, **kwargs):
    """Assemble the transformer-encoder classifier.

    Args:
        config: nested configuration mapping. ``vocab_size`` and
            ``num_classes`` are read from ``config['data']``; the remaining
            hyper-parameters from ``config['model']['net']['structure']``.
        **kwargs: passed straight through to the base-class constructor.

    ``self.embed_initializer`` is used but not assigned in this method
    (presumably provided by the base class -- confirm against it).
    """
    super().__init__(config, **kwargs)
    logging.info("Initialize TransformerModel...")

    data_cfg = config['data']
    self.vocab_size = data_cfg['vocab_size']
    self.num_classes = data_cfg['task']['classes']['num_classes']

    # Copy the structural hyper-parameters onto the instance one by one,
    # keeping the attribute name equal to the configuration key.
    structure_cfg = config['model']['net']['structure']
    for hparam in ('dropout_rate', 'embedding_size', 'num_layers',
                   'l2_reg_lambda', 'max_len', 'transformer_dropout',
                   'residual_conn', 'head_num', 'hidden_dim'):
        setattr(self, hparam, structure_cfg[hparam])

    self.padding_token = utils.PAD_IDX

    # 1 for real tokens, 0 for padding positions.
    self.mask_layer = tf.keras.layers.Lambda(
        lambda inputs: tf.cast(
            tf.not_equal(inputs, self.padding_token), tf.int32))

    self.embed = tf.keras.layers.Embedding(
        self.vocab_size,
        self.embedding_size,
        embeddings_initializer=self.embed_initializer)
    self.pos_embed = layers.PositionEmbedding(self.max_len,
                                              self.embedding_size)
    self.embed_d = tf.keras.layers.Dropout(self.dropout_rate)
    self.transformer_encoder = layers.TransformerEncoder(config)
    self.pool = tf.keras.layers.GlobalMaxPooling1D()
    self.final_dense = tf.keras.layers.Dense(
        self.num_classes,
        activation=tf.keras.activations.linear,
        name="final_dense")
    logging.info("Initialize TransformerModel done.")
def compute_mel_filterbank_features(waveforms, sample_rate=16000, dither=1.0 / np.iinfo(np.int16).max, preemphasis=0.97, frame_length=25, frame_step=10, fft_length=None, window_fn=functools.partial(tf.signal.hann_window, periodic=True), lower_edge_hertz=80.0, upper_edge_hertz=7600.0, num_mel_bins=80, log_noise_floor=1e-3, apply_mask=True):
    """Implement log mel-filterbank extraction using tf ops.

    Args:
      waveforms: float32 tensor with shape [batch_size, max_len]
      sample_rate: sampling rate of the waveform
      dither: stddev of Gaussian noise added to waveform to prevent
        quantization artefacts; no noise is added when it is not > 0
      preemphasis: waveform high-pass filtering constant; the filter is
        skipped when it is not > 0
      frame_length: frame length in ms
      frame_step: frame step in ms
      fft_length: number of fft bins; when None, the smallest power of two
        that is >= the frame length in samples is used
      window_fn: windowing function
      lower_edge_hertz: lowest frequency of the filterbank
      upper_edge_hertz: highest frequency of the filterbank
      num_mel_bins: filterbank size
      log_noise_floor: lower bound applied to the mel spectrogram before the
        log is taken, so that very small values are clipped
      apply_mask: When working on a batch of samples, set padding frames to
        zero

    Returns:
      filterbanks: a float32 tensor with shape [batch_size, len, num_bins, 1]
    """
    # Find the wave length: the largest index for which the value is != 0.
    # Note that waveform samples that are exactly 0.0 are quite common, so
    # simply doing sum(waveforms != 0, axis=-1) will not work correctly.
    wav_lens = tf.reduce_max(tf.expand_dims(tf.range(tf.shape(waveforms)[1]), 0) * tf.to_int32(tf.not_equal(waveforms, 0.0)), axis=-1) + 1
    if dither > 0:
        waveforms += tf.random_normal(tf.shape(waveforms), stddev=dither)
    if preemphasis > 0:
        # The first-order difference shortens every waveform by one sample,
        # so the per-example lengths shrink by one as well.
        waveforms = waveforms[:, 1:] - preemphasis * waveforms[:, :-1]
        wav_lens -= 1
    # Convert frame length / step from milliseconds to samples.
    frame_length = int(frame_length * sample_rate / 1e3)
    frame_step = int(frame_step * sample_rate / 1e3)
    if fft_length is None:
        fft_length = int(2**(np.ceil(np.log2(frame_length))))
    # `stfts` is a complex64 Tensor representing the short-time Fourier
    # Transform of each signal in `waveforms`. Its shape is
    # [batch_size, ?, fft_unique_bins]
    # where fft_unique_bins = fft_length // 2 + 1
    stfts = tf.signal.stft(waveforms, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length, window_fn=window_fn, pad_end=True)
    # Number of frames per example: ceil(wav_lens / frame_step).
    stft_lens = (wav_lens + (frame_step - 1)) // frame_step
    # NOTE(review): `less_equal` keeps frame indices 0..stft_lens inclusive,
    # i.e. one frame more than `stft_lens` -- confirm this is intended.
    masks = tf.to_float(tf.less_equal(tf.expand_dims(tf.range(tf.shape(stfts)[1]), 0), tf.expand_dims(stft_lens, 1)))
    # An energy spectrogram is the magnitude of the complex-valued STFT.
    # A float32 Tensor of shape [batch_size, ?, 257] with the default
    # frame_length / sample_rate / fft_length settings.
    magnitude_spectrograms = tf.abs(stfts)
    # Warp the linear-scale, magnitude spectrograms into the mel-scale.
    num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
    linear_to_mel_weight_matrix = (tf.signal.linear_to_mel_weight_matrix(num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz, upper_edge_hertz))
    mel_spectrograms = tf.tensordot(magnitude_spectrograms, linear_to_mel_weight_matrix, 1)
    # Note: Shape inference for tensordot does not currently handle this case.
    mel_spectrograms.set_shape(magnitude_spectrograms.shape[:-1].concatenate(linear_to_mel_weight_matrix.shape[-1:]))
    log_mel_sgram = tf.log(tf.maximum(log_noise_floor, mel_spectrograms))
    if apply_mask:
        # Zero out the frames that the mask marks as padding.
        log_mel_sgram *= tf.expand_dims(tf.to_float(masks), -1)
    # Append a trailing singleton axis to the returned features.
    return tf.expand_dims(log_mel_sgram, -1, name="mel_sgrams")
def exclude_padding(self, batch):
    """Return the leading part of ``batch`` that holds non-padding ids.

    The number of entries different from ``utils.PAD_IDX`` is counted along
    the last axis and used as the stop index of a slice on the first axis.

    NOTE(review): this only yields a meaningful result when the count is a
    scalar, i.e. ``batch`` is a single 1-D sequence with trailing padding --
    confirm against the caller.
    """
    is_real_token = tf.not_equal(batch, utils.PAD_IDX)
    token_count = tf.reduce_sum(tf.cast(is_real_token, tf.int32), axis=-1)
    return batch[:token_count]