def call(self, inputs):
  """Pads (or crops, for a negative padding_size) inputs in the time dim.

  In streaming inference modes, or with 'valid' padding, or when
  padding_size is 0, inputs pass through untouched.

  Args:
    inputs: tensor of rank >= 2; dim 1 is treated as time.

  Returns:
    padded (padding_size >= 0) or cropped (padding_size < 0) tensor.

  Raises:
    ValueError: if inputs.shape.rank < 2
  """
  if inputs.shape.rank < 2:
    raise ValueError('inputs.shape.rank: %d must be >= 2' % inputs.shape.rank)

  in_streaming_mode = self.mode in [
      modes.Modes.STREAM_INTERNAL_STATE_INFERENCE,
      modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
  ]
  if in_streaming_mode or self.padding == 'valid' or self.padding_size == 0:
    # padding is not applied in streaming mode or on valid
    return inputs

  paddings = [[0, 0]] * inputs.shape.rank
  if self.padding == 'causal':
    paddings[1] = [self.padding_size, 0]
  elif self.padding == 'future':
    paddings[1] = [0, self.padding_size]
  elif self.padding == 'same':
    # Round toward zero so that a negative padding_size is also split
    # (almost) symmetrically between the two sides.
    if self.padding_size >= 0:
      left = self.padding_size // 2
    else:
      left = (self.padding_size + 1) // 2
    paddings[1] = [left, self.padding_size - left]

  if self.padding_size >= 0:
    return tf.pad(inputs, paddings, 'constant')

  # A negative padding_size means cropping: the (non-positive) pad amounts
  # become non-negative crop amounts on each side of the time dim.
  crop_left = -paddings[1][0]
  crop_right = -paddings[1][1]
  if crop_right > 0:
    return inputs[:, crop_left:-crop_right]
  return inputs[:, crop_left:]
def _get_expected_output(self, dilation_rate=(1, 1), stacked=False):
  """Computes the reference (non-streaming) convolution output.

  The input is left-padded in time by the effective (dilated) kernel size
  minus one, so the result lines up with the causal streamed version.

  Args:
    dilation_rate: (time, frequency) dilation of the Conv2D kernel.
    stacked: if True, a second identical causal conv is stacked on top.

  Returns:
    numpy array produced by model.predict on the padded inputs.
  """
  effective_kernel = dilation_rate[0] * (self.kernel_size[0] - 1) + 1
  causal_pad = ((0, 0), (effective_kernel - 1, 0), (0, 0), (0, 0))

  # Pad the front to match the padding of the streamed version.
  padded_inputs = np.pad(self.inputs, causal_pad, 'constant')

  # Basic (non-streaming) convolution layer used as the reference.
  conv = tf.keras.layers.Conv2D(
      self.filters,
      self.kernel_size,
      dilation_rate=dilation_rate,
      kernel_initializer='ones')
  net_input = tf.keras.layers.Input(
      shape=(self.time_dim + effective_kernel - 1, self.feature_dim, 1),
      batch_size=self.batch_size)
  net = conv(net_input)

  if stacked:
    # Stack a second convolutional layer, again causally padded.
    net = tf.pad(net, causal_pad, 'constant')
    conv2 = tf.keras.layers.Conv2D(
        self.filters,
        self.kernel_size,
        dilation_rate=dilation_rate,
        kernel_initializer='ones')
    net = conv2(net)

  model = tf.keras.Model(net_input, net)
  return model.predict(padded_inputs)
def frequeny_pad(inputs, dilation, stride, kernel_size):
  """Pads input tensor in frequency domain.

  Args:
    inputs: input tensor, expected layout [N, Time, Frequency, ...]
    dilation: dilation in frequency dim
    stride: stride in frequency dim
    kernel_size: kernel_size in frequency dim

  Returns:
    padded tensor

  Raises:
    ValueError: if any of input rank is < 3
  """
  if inputs.shape.rank < 3:
    raise ValueError('input_shape.rank:%d must be at least 3' %
                     inputs.shape.rank)

  # Effective (dilated) kernel size, then total 'same'-style padding,
  # split left/right with the extra element going to the right.
  effective_kernel = (kernel_size - 1) * dilation + 1
  total_pad = effective_kernel - stride
  left = total_pad // 2

  paddings = [[0, 0]] * inputs.shape.rank
  paddings[2] = [left, total_pad - left]
  return tf.pad(inputs, paddings, 'constant')
def _non_streaming(self, inputs):
  """Runs the wrapped cell, left-padding conv inputs in time to stay causal."""
  # Zero pad inputs in time dim, from the left, to make convolution causal.
  if self.pad_time_dim and isinstance(
      self.cell, (tf.keras.layers.Conv2D, tf.keras.layers.DepthwiseConv2D)):
    causal_pad = ((0, 0), (self.effective_ksize_tdim - 1, 0), (0, 0), (0, 0))
    inputs = tf.pad(inputs, causal_pad, 'constant')
  return self.cell(inputs)
def random_shift(inputs, time_shift, seed=None):
  """Shifts input data randomly in time dim.

  It can be useful for augmenting training data with random shifts in time
  dim for making model more robust to input audio shifts.

  Args:
    inputs: input tensor [batch_size, time]
    time_shift: defines time shift range: -time_shift...time_shift;
      it is defined in samples
    seed: random seed

  Returns:
    tensor with each row randomly shifted in time, same shape as inputs

  Raises:
    ValueError: if inputs.shape.rank != 2
  """
  if inputs.shape.rank != 2:
    raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

  inputs_shape = inputs.shape.as_list()
  batch_size = inputs_shape[0]
  sequence_length = inputs_shape[1]

  # below function will process 2D arrays, convert it to [batch, time, dummy]
  inputs = tf.expand_dims(inputs, 2)

  # One independent shift per batch element, in [-time_shift, time_shift).
  time_shift_amounts = tf.random.uniform(
      shape=[batch_size],
      minval=-time_shift,
      maxval=time_shift,
      dtype=tf.int32,
      seed=seed)

  outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(batch_size):
    time_shift_amount = time_shift_amounts[i]

    # Positive shift: pad at the front (delays the signal); negative shift:
    # pad at the back and slice from an offset (advances the signal).
    # pylint: disable=cell-var-from-loop
    time_shift_padding = tf.cond(time_shift_amount > 0,
                                 lambda: [[time_shift_amount, 0], [0, 0]],
                                 lambda: [[0, -time_shift_amount], [0, 0]])
    time_shift_offset = tf.cond(time_shift_amount > 0, lambda: [0, 0],
                                lambda: [-time_shift_amount, 0])
    # pylint: enable=cell-var-from-loop

    padded = tf.pad(
        tensor=inputs[i], paddings=time_shift_padding, mode='CONSTANT')
    # Slice back to the original sequence_length so the shape is unchanged.
    padded_sliced = tf.slice(padded, time_shift_offset, [sequence_length, -1])
    outputs = outputs.write(i, padded_sliced)

  # convert it back to [batch, time]
  outputs = tf.squeeze(outputs.stack(), axis=[2])
  outputs.set_shape(inputs_shape)
  return outputs
def _non_streaming(self, inputs):
  """Applies optional temporal padding (causal or same) and runs the cell."""
  if not self.pad_time_dim:
    return self.cell(inputs)

  if isinstance(self.cell, tf.keras.layers.Flatten):
    raise ValueError('pad_time_dim can not be used with Flatten')

  # Temporal padding: causal pads on the left only; same pads
  # ring_buffer_size_in_time_dim // 2 on each side.
  paddings = [[0, 0]] * inputs.shape.rank
  if self.pad_time_dim == 'causal':
    paddings[1] = [self.ring_buffer_size_in_time_dim - 1, 0]
  elif self.pad_time_dim == 'same':
    half = self.ring_buffer_size_in_time_dim // 2
    paddings[1] = [half, half]

  return self.cell(tf.pad(inputs, paddings, 'constant'))
def _non_streaming(self, inputs):
  """Non-streaming inference: runs the cell with optional temporal padding.

  Conv2DTranspose is handled as a special case: its output may be cropped
  back to stride * input_time frames to mirror the streamed output.
  """
  if isinstance(self.get_core_layer(), tf.keras.layers.Conv2DTranspose):
    outputs = self.cell(inputs)

    # During training or non streaming inference the input time dim can be
    # dynamic, so derive the target output length from tf.shape.
    self.output_time_dim = tf.shape(inputs)[1] * self.stride

    if not self.transposed_conv_crop_output:
      return outputs
    if self.pad_time_dim == 'same':
      crop_left = self.ring_buffer_size_in_time_dim // 2
      return outputs[:, crop_left:crop_left + self.output_time_dim, :]
    return outputs[:, 0:self.output_time_dim, :]

  # Pad inputs in time dim: causal or same.
  if self.pad_time_dim:
    unsupported = (tf.keras.layers.Flatten,
                   tf.keras.layers.GlobalMaxPooling2D,
                   tf.keras.layers.GlobalAveragePooling2D)
    if isinstance(self.cell, unsupported):
      raise ValueError('pad_time_dim can not be used with Flatten')

    # Total temporal padding depends on whether the layer consumes one
    # step at a time.
    if self.use_one_step:
      total_pad = self.ring_buffer_size_in_time_dim - 1
    else:
      total_pad = self.ring_buffer_size_in_time_dim

    paddings = [[0, 0]] * inputs.shape.rank
    if self.pad_time_dim == 'causal':
      paddings[1] = [total_pad, 0]
    elif self.pad_time_dim == 'same':
      half = total_pad // 2
      paddings[1] = [half, total_pad - half]
    inputs = tf.pad(inputs, paddings, 'constant')

  return self.cell(inputs)
def call(self, inputs):
  """Pads inputs in the time dim for non-streaming 'causal'/'same' padding.

  Args:
    inputs: tensor of rank >= 2; dim 1 is treated as time.

  Returns:
    padded tensor, or the unchanged input in streaming modes / 'valid'.

  Raises:
    ValueError: if inputs.shape.rank < 2
  """
  if inputs.shape.rank < 2:
    raise ValueError('inputs.shape.rank: %d must be >= 2' % inputs.shape.rank)

  skip_padding = self.mode in [
      Modes.STREAM_INTERNAL_STATE_INFERENCE,
      Modes.STREAM_EXTERNAL_STATE_INFERENCE
  ] or self.padding == 'valid'
  if skip_padding:
    # padding is not applied in streaming mode or on valid
    return inputs

  paddings = [[0, 0]] * inputs.shape.rank
  if self.padding == 'causal':
    paddings[1] = [self.padding_size, 0]
  elif self.padding == 'same':
    half = self.padding_size // 2
    paddings[1] = [half, half]
  return tf.pad(inputs, paddings, 'constant')
def _non_streaming(self, inputs):
  """Applies optional temporal padding and runs the wrapped cell."""
  if not self.pad_time_dim:
    return self.cell(inputs)

  if isinstance(self.cell,
                (tf.keras.layers.Flatten,
                 tf.keras.layers.GlobalMaxPooling2D,
                 tf.keras.layers.GlobalAveragePooling2D)):
    raise ValueError('pad_time_dim can not be used with Flatten')

  # Total temporal padding depends on whether the layer consumes one step
  # at a time.
  if self.use_one_step:
    total_pad = self.ring_buffer_size_in_time_dim - 1
  else:
    total_pad = self.ring_buffer_size_in_time_dim

  # causal pads on the left only; same splits left/right, extra on right.
  paddings = [[0, 0]] * inputs.shape.rank
  if self.pad_time_dim == 'causal':
    paddings[1] = [total_pad, 0]
  elif self.pad_time_dim == 'same':
    half = total_pad // 2
    paddings[1] = [half, total_pad - half]

  return self.cell(tf.pad(inputs, paddings, 'constant'))
def _non_streaming(self, inputs):
  """Optionally emulates the streaming delay in non-streaming mode.

  When `also_in_non_streaming` is set, the signal [batch, time, feature]
  is shifted right in time by `self.delay` frames: zeros are padded at the
  front and the last `self.delay` frames are dropped, preserving shape.

  Args:
    inputs: tensor [batch, time, feature].

  Returns:
    delayed tensor (or the unchanged input).
  """
  if not self.also_in_non_streaming:
    return inputs
  if self.delay == 0:
    # Bug fix: with delay == 0 the slice below would be [:, :-0, :],
    # i.e. [:, :0, :], returning an empty time dim instead of a no-op.
    return inputs
  return tf.pad(inputs,
                ((0, 0), (self.delay, 0), (0, 0)))[:, :-self.delay, :]
def _non_streaming(self, inputs):
  """Optionally emulates the streaming delay in non-streaming mode.

  Rank-generic variant: pads `self.delay` zeros at the front of dim 1
  (time) and drops the last `self.delay` frames, so the shape of `inputs`
  (rank >= 2) is preserved.

  Args:
    inputs: tensor of rank >= 2; dim 1 is treated as time.

  Returns:
    delayed tensor (or the unchanged input).
  """
  if not self.also_in_non_streaming:
    return inputs
  if self.delay == 0:
    # Bug fix: with delay == 0 the slice below would be [:, :-0],
    # i.e. [:, :0], returning an empty time dim instead of a no-op.
    return inputs
  paddings = ((0, 0), (self.delay, 0)) + ((0, 0),) * (inputs.shape.rank - 2)
  return tf.pad(inputs, paddings)[:, :-self.delay]
def random_cutout(
    inputs,
    mask_size,
    mask_value=0,
    seed=None,
    data_format='channels_last',
):
  """Applies cutout (https://arxiv.org/abs/1708.04552) to inputs.

  It is based on addons/tensorflow_addons/image/cutout_ops.py,
  kept here for backward compatibility.

  Args:
    inputs: input tensor [batch_size, time, feature, channels]
    mask_size: mask size (time, feature); a scalar is used for both dims
    mask_value: mask will be filled with this value
    seed: random seed
    data_format: dimensions order, 'channels_last' or 'channels_first'

  Returns:
    masked image

  Raises:
    ValueError: if inputs.shape.rank != 4
  """
  if inputs.shape.rank != 4:
    raise ValueError('inputs.shape.rank:%d must be 4' % inputs.shape.rank)

  # Promote a scalar mask_size to (time, feature).
  mask_size = tf.convert_to_tensor(mask_size)
  if tf.rank(mask_size) == 0:
    mask_size = tf.stack([mask_size, mask_size])

  if data_format == 'channels_last':
    time_size, feature_size = tf.shape(inputs)[1], tf.shape(inputs)[2]
  else:
    time_size, feature_size = tf.shape(inputs)[2], tf.shape(inputs)[3]

  batch_size = tf.shape(inputs)[0]

  # One random cutout center per batch element.
  cutout_center_time = tf.random.uniform(
      shape=[batch_size], minval=0, maxval=time_size, dtype=tf.int32, seed=seed
  )
  cutout_center_feature = tf.random.uniform(
      shape=[batch_size],
      minval=0,
      maxval=feature_size,
      dtype=tf.int32,
      seed=seed)
  offset = tf.transpose([cutout_center_time, cutout_center_feature], [1, 0])

  origin_shape = inputs.shape
  offset = tf.convert_to_tensor(offset)

  # Half-size per dim; pads below clip the cutout window at the borders.
  mask_size = mask_size // 2
  cutout_center_time = offset[:, 0]
  cutout_center_feature = offset[:, 1]

  lower_pads = tf.maximum(0, cutout_center_time - mask_size[0])
  upper_pads = tf.maximum(0, time_size - cutout_center_time - mask_size[0])
  left_pads = tf.maximum(0, cutout_center_feature - mask_size[1])
  right_pads = tf.maximum(0,
                          feature_size - cutout_center_feature - mask_size[1])

  # Per-element shape of the (clipped) zero region inside the mask.
  cutout_shape = tf.transpose(
      [
          time_size - (lower_pads + upper_pads),
          feature_size - (left_pads + right_pads),
      ],
      [1, 0],
  )

  # Build a per-element mask: zeros inside the cutout window, ones outside.
  masks = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(tf.shape(cutout_shape)[0]):
    padding_dims = [
        [lower_pads[i], upper_pads[i]],
        [left_pads[i], right_pads[i]],
    ]
    mask = tf.pad(
        tf.zeros(cutout_shape[i], dtype=inputs.dtype),
        padding_dims,
        constant_values=1,
    )
    masks = masks.write(i, mask)

  # Broadcast the 2D masks over the channel dim.
  if data_format == 'channels_last':
    mask = tf.expand_dims(masks.stack(), -1)
    mask = tf.tile(mask, [1, 1, 1, tf.shape(inputs)[-1]])
  else:
    mask = tf.expand_dims(masks.stack(), 1)
    mask = tf.tile(mask, [1, tf.shape(inputs)[1], 1, 1])

  # Replace masked-out positions with mask_value.
  inputs = tf.where(
      tf.equal(mask, 0),
      tf.ones_like(inputs, dtype=inputs.dtype) * mask_value,
      inputs,
  )
  inputs.set_shape(origin_shape)
  return inputs