def _streaming_internal_state(self, inputs):
    """Runs one streaming step, keeping the overlap remainder in `self.states`.

    The parent layer's output for the current input is summed with the
    remainder saved by the previous step, and the trailing `self.overlap`
    positions along axis 1 are saved as the remainder for the next step.

    Args:
      inputs: tensor forwarded to the parent layer's `call`.

    Returns:
      The summed output, cropped along axis 1 to `self.output_time_dim` when
      `self.crop_output` is set.
    """
    outputs = super(Conv1DTranspose, self).call(inputs)

    if self.overlap == 0:
        # Consecutive outputs do not overlap: there is no state to update.
        if self.crop_output:
            return tf.identity(outputs[:, 0:self.output_time_dim, :])
        return tf.identity(outputs)

    output_shape = outputs.shape.as_list()

    # need to add remainder state to a specific region of output as below:
    # outputs[:,0:self.overlap,:] = outputs[:,0:self.overlap,:] + self.states
    # but 'Tensor' object does not support item assignment,
    # so doing it through full summation below
    output_shape[1] -= self.state_shape[1]
    padded_remainder = tf.concat(
        [self.states, tf.zeros(output_shape, tf.float32)], 1)
    outputs = outputs + padded_remainder

    # Extract the remainder state and subtract the bias if it is used:
    # the bias will be added again by the next step's convolution, so the
    # remainder must hold only the convolution part; otherwise the overlap
    # region would receive the bias twice.
    if self.use_bias:
        new_state = outputs[:, -self.overlap:, :] - self.bias
    else:
        new_state = outputs[:, -self.overlap:, :]
    assign_states = self.states.assign(new_state)

    # Make sure the state variable is updated before the output is consumed.
    with tf.control_dependencies([assign_states]):
        if self.crop_output:
            return tf.identity(outputs[:, 0:self.output_time_dim, :])
        return tf.identity(outputs)
def _streaming_internal_state(self, inputs):
    """Streaming step that stores the overlap remainder in `self.states`.

    Args:
      inputs: tensor forwarded to the parent layer's `call`.

    Returns:
      Parent-layer output with the previous remainder added to its leading
      positions on axis 1; cropped to `self.output_time_dim` on that axis
      when `self.crop_output` is set.
    """
    conv_out = super(Conv1DTranspose, self).call(inputs)

    if self.overlap == 0:
        # Nothing overlaps between steps, so no state handling is required.
        result = (
            conv_out[:, 0:self.output_time_dim, :]
            if self.crop_output else conv_out)
        return tf.identity(result)

    # 'Tensor' objects do not support item assignment, so instead of
    #   conv_out[:, 0:overlap, :] += self.states
    # the state is extended with zeros along axis 1 up to the output length
    # and added to the whole output.
    pad_shape = conv_out.shape.as_list()
    pad_shape[1] -= self.state_shape[1]
    zero_tail = tf.zeros(pad_shape, tf.float32)
    summed = conv_out + tf.concat([self.states, zero_tail], 1)

    # The trailing `overlap` positions become the next remainder. The bias is
    # removed from it because the next step's convolution adds the bias
    # again; the remainder must carry the convolution part only.
    remainder = summed[:, -self.overlap:, :]
    if self.use_bias:
        remainder = remainder - self.bias
    update_op = self.states.assign(remainder)

    # Tie the returned tensor to the state update.
    with tf.control_dependencies([update_op]):
        result = (
            summed[:, 0:self.output_time_dim, :]
            if self.crop_output else summed)
        return tf.identity(result)
def _streaming_external_state(self, inputs, state):
    """Streaming inverse STFT with the overlap buffer passed in and out.

    Args:
      inputs: tensor handed to `tf.signal.inverse_stft`.
      state: overlap buffer produced by the previous call; `None` is replaced
        by an empty list.

    Returns:
      Tuple `(output, new_state)`: the samples emitted by this call and the
      buffer to pass to the next call. When frames do not overlap, the
      inverse transform and the incoming state are returned unchanged.
    """
    if state is None:
        state = []

    # Inverse transform of however many frames were supplied, as float32.
    time_signal = tf.cast(
        tf.signal.inverse_stft(
            inputs,
            self.frame_size,
            self.frame_step,
            self.fft_size,
            window_fn=self.window_fn),
        tf.float32)

    # Without overlap between frames there is no streaming state to maintain.
    if self.frame_size <= self.frame_step:
        return time_signal, state

    # Overlap-add the stored buffer with the first frame_size samples.
    buffer = state + time_signal[:, 0:self.frame_size]

    if self.use_one_step:
        # Frame-by-frame streaming: a single hop is emitted.
        emitted = self.frame_step
    else:
        # Several input frames: keep the samples beyond the first frame too
        # and emit one hop per input frame.
        buffer = tf.concat(
            [buffer, time_signal[:, self.frame_size:]], axis=1)
        emitted = self.frame_step * self.input_frames

    # Take the output hops before the buffer is shifted.
    output = buffer[:, 0:emitted]

    # Ring-buffer shift: append frame_step zeros and keep the last
    # frame_size samples.
    shifted = tf.concat([buffer, tf.zeros([1, self.frame_step])], axis=1)
    return output, shifted[:, -self.frame_size:]
def _streaming_external_state(self, inputs, state):
    """Streaming step with the state passed in and returned explicitly.

    Args:
      inputs: new input tensor for this step.
      state: state returned by the previous step; `None` is replaced by an
        empty list.

    Returns:
      Tuple `(output, new_state)`.

    Raises:
      ValueError: in one-step mode with a non-transposed core layer, if
        `inputs.shape[1]` is not 1.
    """
    if state is None:
        state = []

    core_layer = self.get_core_layer()
    ring_size = self.ring_buffer_size_in_time_dim

    if not isinstance(core_layer, tf.keras.layers.Conv2DTranspose):
        if self.use_one_step:
            # The time dimension always has to equal 1 in streaming mode.
            if inputs.shape[1] != 1:
                raise ValueError('inputs.shape[1]: %d must be 1 ' % inputs.shape[1])

            # Drop the oldest row and append the new one:
            # [batch_size, memory_size, feature_dim, channel]
            kept_rows = state[:, 1:ring_size, :]
            memory = tf.keras.backend.concatenate([kept_rows, inputs], 1)
            return self.cell(memory), memory

        # Several rows per step: append the input and keep the most recent
        # ring_size rows as the next state.
        memory = tf.keras.backend.concatenate([state, inputs], 1)
        next_state = memory[:, -ring_size:, :]  # pylint: disable=invalid-unary-operand-type
        return self.cell(memory), next_state

    # Transposed convolution: the state is the overlapping remainder of the
    # previous output rather than a buffer of past inputs.
    outputs = self.cell(inputs)

    if ring_size == 0:
        if self.transposed_conv_crop_output:
            outputs = outputs[:, 0:self.output_time_dim, :]
        return outputs, []

    # Extend the remainder with zeros along axis 1 up to the output length
    # and add it to the whole output.
    pad_shape = outputs.shape.as_list()
    pad_shape[1] -= self.state_shape[1]
    zero_tail = tf.zeros(pad_shape, tf.float32)
    outputs = outputs + tf.concat([state, zero_tail], 1)

    next_state = outputs[:, -ring_size:, :]  # pylint: disable=invalid-unary-operand-type
    if core_layer.get_config()['use_bias']:
        # The bias lives on the core layer, which may be wrapped by a wrapper
        # layer; it is subtracted from the saved remainder.
        next_state = next_state - core_layer.bias

    if self.transposed_conv_crop_output:
        outputs = outputs[:, 0:self.output_time_dim, :]
    return outputs, next_state
def _streaming_external_state(self, inputs, states):
    """Runs one streaming step with the overlap remainder passed explicitly.

    Args:
      inputs: tensor forwarded to the parent layer's `call`.
      states: remainder returned by the previous step; it is added to the
        leading positions of the output along axis 1.

    Returns:
      Tuple `(outputs, new_state)`. `outputs` is cropped along axis 1 to
      `self.output_time_dim` when `self.crop_output` is set. `new_state` is
      an empty list when `self.overlap` is 0.
    """
    outputs = super(Conv1DTranspose, self).call(inputs)

    if self.overlap == 0:
        # Consecutive outputs do not overlap: no state is carried over.
        if self.crop_output:
            return outputs[:, 0:self.output_time_dim, :], []
        return outputs, []

    # 'Tensor' objects do not support item assignment, so the remainder is
    # extended with zeros along axis 1 up to the output length and added to
    # the whole output.
    output_shape = outputs.shape.as_list()
    output_shape[1] -= self.state_shape[1]
    padded_remainder = tf.concat(
        [states, tf.zeros(output_shape, tf.float32)], 1)
    outputs = outputs + padded_remainder

    # Extract the remainder state and subtract the bias if it is used:
    # the bias will be added again by the next step's convolution, so the
    # remainder must hold only the convolution part; otherwise the overlap
    # region would receive the bias twice.
    if self.use_bias:
        new_state = outputs[:, -self.overlap:, :] - self.bias
    else:
        new_state = outputs[:, -self.overlap:, :]

    if self.crop_output:
        return outputs[:, 0:self.output_time_dim, :], new_state
    return outputs, new_state
def spectrogram_masking(spectrogram, dim=1, masks_number=2, mask_max_size=5):
    """Zeroes out random stripes of a spectrogram along time or frequency.

    Every mask consists of ones with a single contiguous stripe of zeros
    along `dim`. Masks have batch size 1, so each one is broadcast over the
    batch, and they are applied one after another by multiplication.

    Args:
      spectrogram: Input spectrum [batch, time, frequency]
      dim: dimension on which masking will be applied: 1 - time; 2 - frequency
      masks_number: number of masks
      mask_max_size: mask max size (upper bound passed to the random sampler
        of the stripe width)

    Returns:
      masked spectrogram

    Raises:
      ValueError: if `dim` is neither 1 nor 2.
    """
    valid_dims = (1, 2)
    if dim not in valid_dims:
        raise ValueError('Wrong dim value: %d' % dim)

    shape = spectrogram.shape
    time_size, frequency_size = shape[1:3]
    dim_size = shape[dim]  # size of the dimension being masked
    dtype = spectrogram.dtype

    def stripe(length):
        """Returns shape [1, time, frequency] with axis `dim` set to `length`."""
        dims = [1, time_size, frequency_size]
        dims[dim] = length
        return dims

    masked = spectrogram
    for _ in range(masks_number):
        # Width of the zero stripe, then a random offset that keeps the
        # stripe inside the dimension.
        zeros_len = tf.random.uniform([], 0, mask_max_size, tf.int32)
        offset = tf.random.uniform([], 0, dim_size - zeros_len, tf.int32)

        # ones | zeros | ones, concatenated along the masked dimension; the
        # three lengths add up to dim_size.
        mask = tf.concat(
            (
                tf.ones(stripe(dim_size - offset - zeros_len), dtype),
                tf.zeros(stripe(zeros_len), dtype),
                tf.ones(stripe(offset), dtype),
            ),
            dim)
        masked = masked * mask
    return masked
def call(self, inputs, training=None):
    """Applies pre-emphasis along the last axis of `inputs`.

    With `x` the input along the last axis and `p = self.preemph`:
      y[0] = x[0] * (1 - p)
      y[n] = x[n] - p * x[n - 1]  for n >= 1

    Args:
      inputs: tensor whose last axis holds the frame samples.
      training: not used by this method.

    Returns:
      Tensor built by concatenating the two parts above along the last axis.
    """
    rank = inputs.shape.rank
    # The frame is the last dimension.
    frame_axis = rank - 1

    def along_frame(start, stop):
        """Full slices on every axis except a start:stop slice on the frame axis."""
        index = [slice(None)] * rank
        index[frame_axis] = slice(start, stop)
        return tuple(index)

    # First sample of each frame, scaled.
    head = inputs[along_frame(0, 1)] * (1 - self.preemph)

    # Remaining samples minus the scaled previous sample.
    current = inputs[along_frame(1, None)]
    previous = inputs[along_frame(0, -1)]
    tail = current - self.preemph * previous

    return tf.concat((head, tail), axis=frame_axis)
def _streaming_internal_state(self, inputs):
    """Streaming step that keeps its state in the `self.states` variable.

    Args:
      inputs: new input tensor for this step.

    Returns:
      Output of `self.cell` for this step; in the transposed-convolution
      case the previous remainder is added to it and it is optionally cropped
      along axis 1 to `self.output_time_dim`.

    Raises:
      ValueError: in one-step mode with a non-transposed core layer, if
        `inputs.shape[1]` is not 1.
    """
    core_layer = self.get_core_layer()
    ring_size = self.ring_buffer_size_in_time_dim

    if not isinstance(core_layer, tf.keras.layers.Conv2DTranspose):
        if self.use_one_step:
            # The time dimension always has to equal 1 in streaming mode.
            if inputs.shape[1] != 1:
                raise ValueError('inputs.shape[1]: %d must be 1 ' % inputs.shape[1])

            # Drop the oldest row and append the new one:
            # [batch_size, memory_size, feature_dim, channel]
            kept_rows = self.states[:, 1:ring_size, :]
            memory = tf.keras.backend.concatenate([kept_rows, inputs], 1)
            update_op = self.states.assign(memory)
            with tf.control_dependencies([update_op]):
                return self.cell(memory)

        if not ring_size:
            # No buffered history: the cell sees the input as is.
            return self.cell(inputs)

        # Several rows per step: append the input and store the most recent
        # ring_size rows as the new state.
        memory = tf.keras.backend.concatenate([self.states, inputs], 1)
        next_state = memory[:, -ring_size:, :]  # pylint: disable=invalid-unary-operand-type
        update_op = self.states.assign(next_state)
        with tf.control_dependencies([update_op]):
            return self.cell(memory)

    # Transposed convolution: the state is the overlapping remainder of the
    # previous output rather than a buffer of past inputs.
    outputs = self.cell(inputs)

    if ring_size == 0:
        if self.transposed_conv_crop_output:
            outputs = outputs[:, 0:self.output_time_dim]
        return outputs

    # 'Tensor' objects do not support item assignment, so instead of
    #   outputs[:, 0:ring_size, :] += self.states
    # the state is extended with zeros along axis 1 up to the output length
    # and added to the whole output.
    pad_shape = outputs.shape.as_list()
    pad_shape[1] -= self.state_shape[1]
    zero_tail = tf.zeros(pad_shape, tf.float32)
    outputs = outputs + tf.concat([self.states, zero_tail], 1)

    # The trailing ring_size positions become the next remainder. The bias is
    # removed from it because the next step adds the bias again; the
    # remainder must carry the convolution part only.
    next_state = outputs[:, -ring_size:, :]  # pylint: disable=invalid-unary-operand-type
    if core_layer.get_config()['use_bias']:
        # The bias lives on the core layer, which may be wrapped by a wrapper
        # layer.
        next_state = next_state - core_layer.bias
    update_op = self.states.assign(next_state)

    # Tie the returned tensor to the state update.
    with tf.control_dependencies([update_op]):
        result = (
            outputs[:, 0:self.output_time_dim, :]
            if self.transposed_conv_crop_output else outputs)
        return tf.identity(result)