def __init__(self,
             dmodel=144,
             reduction_factor=4,
             num_blocks=16,
             head_size=36,
             num_heads=4,
             kernel_size=32,
             fc_factor=0.5,
             dropout=0.0,
             add_wav_info=False,
             hop_size=80,
             name="conformer_encoder",
             **kwargs):
    """Build a Conformer encoder: conv subsampling followed by a stack of blocks.

    Args:
        dmodel: model (feature) dimension shared by all sub-layers.
        reduction_factor: time-axis reduction applied by the subsampling front-end.
        num_blocks: number of stacked ConformerBlock layers.
        head_size / num_heads: attention geometry forwarded to each block.
        kernel_size: depthwise-conv kernel size inside each block.
        fc_factor: feed-forward residual scaling forwarded to each block.
        dropout: dropout rate used by the front-end and every block.
        add_wav_info: when True, a WavePickModel branch is created whose output
            is expected to be fused with the mel features elsewhere.
        hop_size: frame hop passed to the waveform branch.
        name: Keras layer/model name.
        **kwargs: forwarded to the base-class constructor.
    """
    super(ConformerEncoder_, self).__init__(name=name, **kwargs)
    self.dmodel = dmodel
    self.reduction_factor = reduction_factor
    # Front-end that projects/subsamples the input features to `dmodel`.
    self.conv_subsampling = ConvSubsampling(
        odim=dmodel, reduction_factor=reduction_factor, dropout=dropout)
    self.add_wav_info = add_wav_info
    if self.add_wav_info:
        # Optional raw-waveform feature branch (created before the blocks,
        # matching the original construction order).
        self.wav_layer = WavePickModel(dmodel, hop_size)
    # The main stack of Conformer blocks.
    self.conformer_blocks = [
        ConformerBlock(input_dim=dmodel,
                       dropout=dropout,
                       fc_factor=fc_factor,
                       head_size=head_size,
                       num_heads=num_heads,
                       kernel_size=kernel_size,
                       name=f"conformer_block_{block_idx}")
        for block_idx in range(num_blocks)
    ]
def __init__(self,
             dmodel=144,
             reduction_factor=4,
             num_blocks=4,
             cell_nums=4,
             head_size=36,
             num_heads=4,
             kernel_size=32,
             fc_factor=0.5,
             dropout=0.0,
             add_wav_info=False,
             hop_size=80,
             name="streaming_conformer_encoder",
             **kwargs):
    """Build a streaming Conformer encoder.

    The encoder is a conv subsampling front-end followed by `cell_nums`
    StreamingEncoderCell layers wrapped in a single stateful tf.keras.layers.RNN,
    so per-chunk state can be carried between calls.

    Args:
        dmodel: model (feature) dimension shared by all sub-layers.
        reduction_factor: time-axis reduction of the subsampling front-end.
        num_blocks: Conformer blocks per streaming cell.
        cell_nums: number of stacked streaming cells inside the RNN wrapper.
        head_size / num_heads / kernel_size / fc_factor: forwarded to each cell.
        dropout: dropout rate for the front-end and the cells.
        add_wav_info: when True, a WavePickModel branch is created and fused
            with the mel features in `call`.
        hop_size: frame hop passed to the waveform branch.
        name: Keras model name; also used as the prefix of each cell's name.
        **kwargs: forwarded to the base-class constructor.
    """
    # Fix: forward `name` and **kwargs to the base class so the model is
    # actually named (previously they were silently dropped; cf. the
    # non-streaming ConformerEncoder_, which forwards both).
    super(StreamingConformerEncoder, self).__init__(name=name, **kwargs)
    self.dmodel = dmodel
    self.reduction_factor = reduction_factor
    self.conv_subsampling = ConvSubsampling(
        odim=dmodel, reduction_factor=reduction_factor, dropout=dropout)
    self.dropout = dropout
    self.cell_nums = cell_nums
    self.add_wav_info = add_wav_info
    if self.add_wav_info:
        self.wav_layer = WavePickModel(dmodel, hop_size)
    cells = []
    for i in range(cell_nums):
        cells.append(
            StreamingEncoderCell(
                dmodel=dmodel,
                num_blocks=num_blocks,
                head_size=head_size,
                num_heads=num_heads,
                kernel_size=kernel_size,
                fc_factor=fc_factor,
                dropout=dropout,
                name=name + 'cell_%s' % i,
            ))
    # NOTE(review): 'customer_rnn' looks like a typo for 'custom_rnn', but the
    # layer name is kept byte-identical to avoid breaking checkpoint loading.
    self.custom_layer = tf.keras.layers.RNN(cells,
                                            return_sequences=True,
                                            return_state=True,
                                            name='customer_rnn')
def __init__(self, arch_config, **kwargs):
    """Build a DeepSpeech2-style model from `arch_config`.

    Constructs two tf.keras.Sequential pipelines:
      * self.Cnn_feature_extractor — conv stack (+ optional reshape helpers)
        ending in a Dense feature projector;
      * self.Rnn_feature_extractor — (bi)directional RNN stack with
        sequence-wise batch norm, optional row convolution, and optional FC head.

    Args:
        arch_config: dict with optional "conv_conf", "rnn_conf", "fc_conf"
            sub-dicts; missing keys are filled from DEFAULT_CONV/RNN/FC.
        **kwargs: must contain 'add_wav_info'; when truthy, must also contain
            'hop_size' (a KeyError is raised otherwise).
    """
    super(DeepSpeech2, self).__init__()
    # Merge user config over the module-level defaults.
    conv_conf = append_default_keys_dict(DEFAULT_CONV, arch_config.get("conv_conf", {}))
    rnn_conf = append_default_keys_dict(DEFAULT_RNN, arch_config.get("rnn_conf", {}))
    fc_conf = append_default_keys_dict(DEFAULT_FC, arch_config.get("fc_conf", {}))
    # Per-layer conv hyperparameter lists must be parallel.
    assert len(conv_conf["conv_strides"]) == \
        len(conv_conf["conv_filters"]) == len(conv_conf["conv_kernels"])
    assert conv_conf["conv_type"] in [1, 2]
    assert rnn_conf["rnn_type"] in ["lstm", "gru", "rnn"]
    assert conv_conf["conv_dropout"] >= 0.0 and rnn_conf["rnn_dropout"] >= 0.0
    layer = []
    # conv_type selects 2-D vs 1-D convolutions; the 1-D path first merges the
    # last two input dims into one channel axis.
    if conv_conf["conv_type"] == 2:
        conv = tf.keras.layers.Conv2D
    else:
        layer += [Merge2LastDims("conv1d_features")]
        conv = tf.keras.layers.Conv1D
    ker_shape = np.shape(conv_conf["conv_kernels"])
    stride_shape = np.shape(conv_conf["conv_strides"])
    filter_shape = np.shape(conv_conf["conv_filters"])
    # The outer lists must be flat (one entry per conv layer); entries may
    # themselves be tuples for the 2-D case.
    assert len(ker_shape) == 1 and len(stride_shape) == 1 and len(filter_shape) == 1
    # Conv -> BN -> ReLU -> Dropout, one group per configured filter count.
    for i, fil in enumerate(conv_conf["conv_filters"]):
        layer += [conv(filters=fil,
                       kernel_size=conv_conf["conv_kernels"][i],
                       strides=conv_conf["conv_strides"][i],
                       padding="same",
                       activation=None,
                       dtype=tf.float32,
                       name=f"cnn_{i}")]
        layer += [tf.keras.layers.BatchNormalization(name=f"cnn_bn_{i}")]
        layer += [tf.keras.layers.ReLU(name=f"cnn_relu_{i}")]
        layer += [tf.keras.layers.Dropout(conv_conf["conv_dropout"],
                                          name=f"cnn_dropout_{i}")]
        # NOTE: last_dim is only bound inside this loop; an empty
        # "conv_filters" list would make the Dense below raise NameError.
        last_dim = fil
    if conv_conf["conv_type"] == 2:
        # Flatten (time, freq, channels) back to (time, features) for the RNN.
        layer += [Merge2LastDims("reshape_conv2d_to_rnn")]
    layer += [tf.keras.layers.Dense(last_dim, name='feature_projector')]
    self.Cnn_feature_extractor = tf.keras.Sequential(layer)
    self.add_wav_info = kwargs['add_wav_info']
    if kwargs['add_wav_info']:
        hop_size = kwargs['hop_size']
        # Accumulate the total time-axis stride so the waveform branch hops at
        # the same effective rate as the conv stack's output.
        for i, fil in enumerate(conv_conf["conv_strides"]):
            # NOTE(review): indexing [i][0] assumes each stride is a
            # (time, freq) pair, i.e. conv_type == 2 — TODO confirm this path
            # is never taken with scalar Conv1D strides.
            hop_size *= conv_conf["conv_strides"][i][0]
        self.wav_layer = WavePickModel(last_dim, hop_size)
    layer = []
    rnn = get_rnn(rnn_conf["rnn_type"])
    # To time major (only the bidirectional path runs time-major).
    if rnn_conf["rnn_bidirectional"]:
        layer += [TransposeTimeMajor("transpose_to_time_major")]
    # RNN layers
    for i in range(rnn_conf["rnn_layers"]):
        if rnn_conf["rnn_bidirectional"]:
            layer += [tf.keras.layers.Bidirectional(
                rnn(rnn_conf["rnn_units"],
                    activation=rnn_conf["rnn_activation"],
                    time_major=True,
                    dropout=rnn_conf["rnn_dropout"],
                    return_sequences=True,
                    use_bias=True),
                name=f"b{rnn_conf['rnn_type']}_{i}")]
            layer += [SequenceBatchNorm(time_major=True,
                                        name=f"sequence_wise_bn_{i}")]
        else:
            layer += [rnn(rnn_conf["rnn_units"],
                          activation=rnn_conf["rnn_activation"],
                          dropout=rnn_conf["rnn_dropout"],
                          return_sequences=True,
                          use_bias=True,
                          name=f"{rnn_conf['rnn_type']}_{i}")]
            layer += [SequenceBatchNorm(time_major=False,
                                        name=f"sequence_wise_bn_{i}")]
            # Row convolution (future context) applies to the unidirectional
            # path, as in the Deep Speech 2 architecture.
            if rnn_conf["rnn_rowconv"]:
                layer += [RowConv1D(filters=rnn_conf["rnn_units"],
                                    future_context=rnn_conf["rnn_rowconv_context"],
                                    name=f"row_conv_{i}")]
    # To batch major
    if rnn_conf["rnn_bidirectional"]:
        layer += [TransposeTimeMajor("transpose_to_batch_major")]
    # FC Layers (optional hidden head after the RNN stack).
    if fc_conf["fc_units"]:
        assert fc_conf["fc_dropout"] >= 0.0
        for idx, units in enumerate(fc_conf["fc_units"]):
            layer += [tf.keras.layers.Dense(units=units,
                                            activation=None,
                                            use_bias=True,
                                            name=f"hidden_fc_{idx}")]
            layer += [tf.keras.layers.BatchNormalization(name=f"hidden_fc_bn_{idx}")]
            layer += [tf.keras.layers.ReLU(name=f"hidden_fc_relu_{idx}")]
            layer += [tf.keras.layers.Dropout(fc_conf["fc_dropout"],
                                              name=f"hidden_fc_dropout_{idx}")]
    self.Rnn_feature_extractor = tf.keras.Sequential(layer)
class StreamingConformerEncoder(tf.keras.Model):
    """Streaming Conformer encoder.

    Conv subsampling front-end followed by `cell_nums` StreamingEncoderCell
    layers wrapped in a single tf.keras.layers.RNN so that per-chunk state can
    be threaded through successive calls (training on chunked sequences via
    `call`, one-chunk-at-a-time decoding via `inference`).
    """

    def __init__(self,
                 dmodel=144,
                 reduction_factor=4,
                 num_blocks=4,
                 cell_nums=4,
                 head_size=36,
                 num_heads=4,
                 kernel_size=32,
                 fc_factor=0.5,
                 dropout=0.0,
                 add_wav_info=False,
                 hop_size=80,
                 name="streaming_conformer_encoder",
                 **kwargs):
        """Initial variables.

        Args:
            dmodel: model (feature) dimension shared by all sub-layers.
            reduction_factor: time-axis reduction of the subsampling front-end.
            num_blocks: Conformer blocks per streaming cell.
            cell_nums: number of stacked streaming cells in the RNN wrapper.
            head_size / num_heads / kernel_size / fc_factor: per-cell settings.
            dropout: dropout rate for the front-end and the cells.
            add_wav_info: when True, a WavePickModel branch is created and its
                output is added to the mel features.
            hop_size: frame hop passed to the waveform branch.
            name: model name; also used as the prefix of each cell's name.
            **kwargs: forwarded to tf.keras.Model.
        """
        # Fix: forward `name`/**kwargs so the Keras model is actually named
        # (previously dropped, leaving the model with an auto-generated name).
        super(StreamingConformerEncoder, self).__init__(name=name, **kwargs)
        self.dmodel = dmodel
        self.reduction_factor = reduction_factor
        self.conv_subsampling = ConvSubsampling(
            odim=dmodel, reduction_factor=reduction_factor, dropout=dropout)
        self.dropout = dropout
        self.cell_nums = cell_nums
        self.add_wav_info = add_wav_info
        if self.add_wav_info:
            self.wav_layer = WavePickModel(dmodel, hop_size)
        cells = []
        for i in range(cell_nums):
            cells.append(
                StreamingEncoderCell(
                    dmodel=dmodel,
                    num_blocks=num_blocks,
                    head_size=head_size,
                    num_heads=num_heads,
                    kernel_size=kernel_size,
                    fc_factor=fc_factor,
                    dropout=dropout,
                    name=name + 'cell_%s' % i,
                ))
        # NOTE(review): 'customer_rnn' looks like a typo for 'custom_rnn';
        # kept byte-identical to avoid breaking checkpoint loading.
        self.custom_layer = tf.keras.layers.RNN(cells,
                                                return_sequences=True,
                                                return_state=True,
                                                name='customer_rnn')

    @tf.function(experimental_relax_shapes=True)
    def call(self, inputs, states=None, training=None, mask=None):
        """Encode chunked features; returns (outputs, new_states).

        `inputs` is either mel features, or a (mel, wav) pair when
        add_wav_info is set. The first two dims are merged before the
        front-end and split back afterwards — assumes inputs are shaped
        (batch, chunks, ...); TODO confirm against callers.
        """
        if self.add_wav_info:
            mel_inputs, wav_inputs = inputs
            B = tf.shape(mel_inputs)[0]
            T = tf.shape(mel_inputs)[1]
            mel_inputs = merge_two_first_dims(mel_inputs)
            wav_inputs = merge_two_first_dims(wav_inputs)
            mel_outputs = self.conv_subsampling(mel_inputs, training=training)
            wav_outputs = self.wav_layer(wav_inputs, training=training)
            # Waveform branch is fused additively with the mel branch.
            outputs = mel_outputs + wav_outputs
            outputs = split_two_first_dims(outputs, B, T)
        else:
            mel_inputs = inputs
            B = tf.shape(mel_inputs)[0]
            T = tf.shape(mel_inputs)[1]
            mel_inputs = merge_two_first_dims(mel_inputs)
            outputs = self.conv_subsampling(mel_inputs, training=training)
            outputs = split_two_first_dims(outputs, B, T)
        if states is None:
            states = self.custom_layer.get_initial_state(outputs)
        outputs = self.custom_layer(outputs, initial_state=states)
        # RNN with return_state=True yields [sequence_output, *states].
        return outputs[0], outputs[1:]

    def get_init_states(self, inputs):
        """Return zero states for the streaming cells, shaped from `inputs`."""
        return self.custom_layer.get_initial_state(inputs)

    def inference(self, inputs, states):
        """Encode a single chunk with explicit carried `states` (no training)."""
        if self.add_wav_info:
            mel_inputs, wav_inputs = inputs
            mel_outputs = self.conv_subsampling(mel_inputs, training=False)
            wav_outputs = self.wav_layer(wav_inputs, training=False)
            outputs = mel_outputs + wav_outputs
        else:
            mel_inputs = inputs
            outputs = self.conv_subsampling(mel_inputs, training=False)
        # Add a singleton chunk axis so the RNN wrapper sees one time step.
        outputs = tf.expand_dims(outputs, 1)
        outputs = self.custom_layer(outputs, initial_state=states)
        new_states = outputs[1:]
        result = tf.squeeze(outputs[0], 1)
        return result, new_states

    def get_config(self):
        """Return the model config merged with sub-layer configs."""
        conf = super(StreamingConformerEncoder, self).get_config()
        conf.update(self.conv_subsampling.get_config())
        if self.add_wav_info:
            conf.update(self.wav_layer.get_config())
        conf.update(self.custom_layer.get_config())
        # Fix: the original built `conf` but never returned it, so
        # get_config() always yielded None and broke serialization.
        return conf