def delay_model(flags, time_delay):
  """Build a model whose only layer is a Delay applied to the input audio.

  Args:
    flags: model and data settings; `desired_samples` and `batch_size` are
      read from it to define the input tensor
    time_delay: delay in time dim, forwarded to delay.Delay

  Returns:
    Keras model
  """
  audio_in = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  # expand_dims with its default axis appends one trailing dimension
  expanded = tf.keras.backend.expand_dims(audio_in)
  delayed = delay.Delay(delay=time_delay)(expanded)

  return tf.keras.Model(audio_in, delayed)
def residual_model(flags,
                   cnn_filters,
                   cnn_kernel_size,
                   cnn_act,
                   cnn_use_bias,
                   cnn_padding,
                   delay_also_in_non_streaming,
                   dilation=1):
  """Toy deep convolutional model with residual connections.

  It can be used for speech enhancement. Every layer is built as
  TemporalPadding -> Stream(identity) -> Conv1D('valid') and its output is
  added to the (optionally delayed) layer input.

  Args:
    flags: model and data settings; `desired_samples` and `batch_size` are
      read from it to define the input tensor
    cnn_filters: list of filters in conv layer
    cnn_kernel_size: list of kernel_size in conv layer
    cnn_act: list of activation functions in conv layer
    cnn_use_bias: list of use_bias in conv layer
    cnn_padding: list of padding in conv layer, each 'same' or 'causal'
    delay_also_in_non_streaming: Whether to apply delay also in non-streaming.
      When true, the temporal padding is forced to 'causal' for every layer.
    dilation: dilation applied on all conv layers

  Returns:
    Tuple (Keras model, sum delay), where sum delay is the sum of the delays
    of the Delay layers inserted on the residual paths of 'same' layers.

  Raises:
    ValueError: if any of input list has different length from any other
      or padding is not in [same, causal]
  """
  expected_len = len(cnn_filters)
  other_lists = (cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding)
  if any(len(values) != expected_len for values in other_lists):
    raise ValueError('all input lists have to be the same length')

  # it is an example of deep conv model for speech enhancement
  # which can be trained in non streaming mode and converted to streaming mode
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = tf.keras.backend.expand_dims(input_audio)
  sum_delay = 0

  for filters, kernel_size, activation, use_bias, padding in zip(
      cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding):
    # number of extra time samples a dilated 'valid' conv consumes
    time_buffer_size = dilation * (kernel_size - 1)

    if padding == 'causal':
      # residual connection is simple with 'causal' padding
      net_residual = net
    elif padding == 'same':
      # residual connection in streaming mode needs delay with 'same' padding
      delay_val = time_buffer_size // 2
      net_residual = delay.Delay(
          delay=delay_val,
          also_in_non_streaming=delay_also_in_non_streaming)(net)
      sum_delay += delay_val
    else:
      raise ValueError('wrong padding mode ', padding)

    # it is easier to convert model to streaming mode when padding function
    # is decoupled from conv layer
    net = temporal_padding.TemporalPadding(
        padding='causal' if delay_also_in_non_streaming else padding,
        padding_size=time_buffer_size)(net)

    # it is a ring buffer in streaming mode and lambda x during training
    net = stream.Stream(
        cell=tf.identity,
        ring_buffer_size_in_time_dim=time_buffer_size,
        use_one_step=False,
        pad_time_dim=None)(net)

    # padding has to be valid! dilation_rate must match the dilation used for
    # time_buffer_size: only then does the conv remove exactly the samples
    # added by TemporalPadding, so Add() sees equal time lengths.
    net = tf.keras.layers.Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        dilation_rate=dilation,
        activation=activation,
        use_bias=use_bias,
        padding='valid')(net)

    net = tf.keras.layers.Add()([net, net_residual])

  return tf.keras.Model(input_audio, net), sum_delay
def conv_model(flags,
               cnn_filters,
               cnn_kernel_size,
               cnn_act,
               cnn_use_bias,
               cnn_padding,
               dilation=1):
  """Toy convolutional model with sequence of convs with different paddings.

  It can be used for speech enhancement. Every conv is wrapped in
  stream.Stream, which receives the layer's padding mode via pad_time_dim.

  Args:
    flags: model and data settings; `desired_samples` and `batch_size` are
      read from it to define the input tensor
    cnn_filters: list of filters in conv layer
    cnn_kernel_size: list of kernel_size in conv layer
    cnn_act: list of activation functions in conv layer
    cnn_use_bias: list of use_bias in conv layer
    cnn_padding: list of padding in conv layer, each 'same' or 'causal'
    dilation: dilation applied on all conv layers

  Returns:
    Tuple (model, sum_delay, sum_shift):
      model: Keras model.
      sum_delay: for every 'same' layer, twice the delay of the Delay layer
        inserted before it, summed over layers.
      sum_shift: sum of kernel_size over all 'causal' layers.
    NOTE(review): the doubling in sum_delay presumably accounts for both the
    explicit Delay layer and the look-ahead of the 'same' conv; confirm
    against callers.

  Raises:
    ValueError: if any of input list has different length from any other
      or padding is not in [same, causal]
  """
  expected_len = len(cnn_filters)
  other_lists = (cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding)
  if any(len(values) != expected_len for values in other_lists):
    raise ValueError('all input lists have to be the same length')

  # it is an example of deep conv model for speech enhancement
  # which can be trained in non streaming mode and converted to streaming mode
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = tf.keras.backend.expand_dims(input_audio)
  sum_delay = 0
  sum_shift = 0

  for filters, kernel_size, activation, use_bias, padding in zip(
      cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding):
    # time span covered by the dilated kernel beyond the current sample
    time_buffer_size = dilation * (kernel_size - 1)

    if padding == 'same':
      # need a delay with 'same' padding in streaming mode
      delay_val = time_buffer_size // 2
      net = delay.Delay(delay=delay_val)(net)
      sum_delay += delay_val * 2
    elif padding == 'causal':
      sum_shift += kernel_size
    else:
      raise ValueError('wrong padding mode ', padding)

    # dilation_rate is passed so the conv really uses the dilation that
    # delay_val above was derived from (and that the docstring promises).
    conv = tf.keras.layers.Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        dilation_rate=dilation,
        activation=activation,
        use_bias=use_bias,
        padding='valid')
    net = stream.Stream(
        cell=conv, use_one_step=False, pad_time_dim=padding)(net)

  return tf.keras.Model(input_audio, net), sum_delay, sum_shift
def transposed_conv_model(flags, cnn_filters, cnn_kernel_size, cnn_act,
                          cnn_use_bias, cnn_paddings, trans_paddings):
  """Toy deep convolutional model with transposed convolutions.

  It can be used for speech enhancement. Each stage applies a Stream-wrapped
  Conv2DTranspose (kernel (3, 1), strides (2, 1)) on a temporarily expanded
  tensor, then a Stream-wrapped Conv1D. Activation and use_bias settings are
  applied to the Conv1D only.

  Args:
    flags: model and data settings; `desired_samples` and `batch_size` are
      read from it to define the input tensor
    cnn_filters: list of filters for conv layer (also used for the
      transposed conv of the same stage)
    cnn_kernel_size: list of kernel_size for conv layer
    cnn_act: list of activation functions for conv layer
    cnn_use_bias: list of use_bias for conv layer
    cnn_paddings: list of padding for conv layer, each 'same' or 'causal'
    trans_paddings: list of padding for transposed conv layer

  Returns:
    Keras model

  Raises:
    ValueError: if any of input list has different length from any other
      or padding is not in [same, causal]
  """
  num_stages = len(cnn_filters)
  remaining_lists = (cnn_kernel_size, cnn_act, cnn_use_bias, cnn_paddings,
                     trans_paddings)
  if any(len(values) != num_stages for values in remaining_lists):
    raise ValueError('all input lists have to be the same length')

  # it is an example of deep conv model for speech enhancement
  # which can be trained in non streaming mode and converted to streaming mode
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = tf.keras.backend.expand_dims(input_audio)

  stage_settings = zip(cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias,
                       cnn_paddings, trans_paddings)
  for (filters, kernel_size, activation, use_bias, padding,
       trans_padding) in stage_settings:
    buffer_len = kernel_size - 1

    # insert an axis at -2 for the 2D transposed conv; it is squeezed out
    # again right after the layer
    net = tf.keras.backend.expand_dims(net, axis=-2)
    upsampler = tf.keras.layers.Conv2DTranspose(
        filters=filters, kernel_size=(3, 1), strides=(2, 1), padding='valid')
    net = stream.Stream(cell=upsampler, pad_time_dim=trans_padding)(net)
    net = tf.keras.backend.squeeze(net, axis=-2)

    if padding == 'same':
      # model looking into future, so introducing delay for streaming mode
      net = delay.Delay(delay=buffer_len // 2)(net)
    elif padding != 'causal':
      raise ValueError('wrong padding mode ', padding)

    # the conv itself is 'valid'; the padding mode is handed to Stream
    conv = tf.keras.layers.Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        use_bias=use_bias,
        padding='valid')
    net = stream.Stream(
        cell=conv, use_one_step=False, pad_time_dim=padding)(net)

  return tf.keras.Model(input_audio, net)