def _create_sinc_convs(self):
    """Assemble the front-end network and store it in ``self.blocks``.

    The network is a ``torch.nn.Sequential`` of six named stages:
    ``SincConvBlock`` (sinc filters, log compression, batch norm and
    average pooling) followed by ``DConvBlock1`` .. ``DConvBlock5``, each
    produced by ``self.gen_lsc_block``. The sinc filter layer is also kept
    as ``self.filters``.
    """
    # Stage 0: SincConvBlock
    sinc_channels = 128
    self.filters = SincConv(
        self.in_channels,
        sinc_channels,
        kernel_size=101,
        stride=1,
        fs=self.fs,
        window_func=self.windowing_type,
        scale_type=self.scale_type,
    )
    sinc_stage = torch.nn.Sequential(
        OrderedDict(
            [
                ("Filters", self.filters),
                ("LogCompression", LogCompression()),
                ("BatchNorm", torch.nn.BatchNorm1d(sinc_channels, affine=True)),
                ("AvgPool", torch.nn.AvgPool1d(2)),
            ]
        )
    )
    stages = [("SincConvBlock", sinc_stage)]

    # Stage 1: connects the sinc output to the front-end "body"
    body_channels = 128
    stages.append(
        (
            "DConvBlock1",
            self.gen_lsc_block(
                sinc_channels,
                body_channels,
                depthwise_kernel_size=25,
                depthwise_stride=2,
                pointwise_groups=0,
                avgpool=True,
                dropout_probability=0.1,
            ),
        )
    )

    # Stages 2-4: a stack of identically configured convolutional blocks;
    # only the first one changes the channel count.
    width = body_channels
    for index in (2, 3, 4):
        stage = self.gen_lsc_block(
            width,
            self.out_channels,
            depthwise_kernel_size=9,
            depthwise_stride=1,
        )
        stages.append((f"DConvBlock{index}", stage))
        width = self.out_channels

    # Stage 5: acts as coupling to the encoder
    stages.append(
        (
            "DConvBlock5",
            self.gen_lsc_block(
                width,
                self.out_channels,
                depthwise_kernel_size=7,
                depthwise_stride=1,
                pointwise_groups=0,
            ),
        )
    )

    self.blocks = torch.nn.Sequential(OrderedDict(stages))
def test_sinc_filters():
    """Check the SincConv output shape for mono and multichannel input."""
    batch_size, n_filters, n_samples = 50, 128, 400
    expected_shape = torch.Size([batch_size, n_filters, 300])

    # one input channel first, then the multichannel case
    for n_channels in (1, 2):
        sinc_layer = SincConv(
            in_channels=n_channels,
            out_channels=n_filters,
            kernel_size=101,
            stride=1,
            fs=16000,
        )
        signal = torch.randn(
            [batch_size, n_channels, n_samples], requires_grad=True
        )
        output = sinc_layer(signal)
        assert output.shape == expected_shape
class LightweightSincConvs(AbsPreEncoder):
    """Lightweight Sinc Convolutions.

    Provide a frontend for raw audio input.
    https://arxiv.org/abs/2010.07597

    The network is a stack of six named stages held in ``self.blocks``:
    a ``SincConvBlock`` followed by ``DConvBlock1`` .. ``DConvBlock5``.
    """

    def __init__(
        self,
        fs: Union[int, str, float] = 16000,
        in_channels: int = 1,
        out_channels: int = 256,
        activation_type: str = "leakyrelu",
        dropout_type: str = "dropout",
        windowing_type: str = "hamming",
        scale_type: str = "mel",
    ):
        """Initialize the module.

        Args:
            fs: Sample rate. A string is parsed with
                ``humanfriendly.parse_size``.
            in_channels: Number of input channels.
            out_channels: Number of output channels (for each input channel).
            activation_type: Choice of activation function
                ("leakyrelu" or "relu").
            dropout_type: Choice of dropout function
                ("dropout", "spatial" or "dropout2d").
            windowing_type: Choice of windowing function.
            scale_type:  Choice of filter-bank initialization scale.

        Raises:
            NotImplementedError: If ``dropout_type`` or ``activation_type``
                is not one of the supported choices.
        """
        assert check_argument_types()
        super().__init__()
        if isinstance(fs, str):
            fs = humanfriendly.parse_size(fs)
        self.fs = fs
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.activation_type = activation_type
        self.dropout_type = dropout_type
        self.windowing_type = windowing_type
        self.scale_type = scale_type

        self.choices_dropout = {
            "dropout": torch.nn.Dropout,
            "spatial": SpatialDropout,
            "dropout2d": torch.nn.Dropout2d,
        }
        if dropout_type not in self.choices_dropout:
            raise NotImplementedError(
                f"Dropout type has to be one of "
                f"{list(self.choices_dropout.keys())}",
            )

        self.choices_activation = {
            "leakyrelu": torch.nn.LeakyReLU,
            "relu": torch.nn.ReLU,
        }
        if activation_type not in self.choices_activation:
            raise NotImplementedError(
                f"Activation type has to be one of "
                f"{list(self.choices_activation.keys())}",
            )

        # initialization
        self._create_sinc_convs()
        # Sinc filters require custom initialization
        self.espnet_initialization_fn()

    def _create_sinc_convs(self):
        """Build all stages and store them as ``self.blocks``.

        The sinc filter layer is additionally kept as ``self.filters`` so
        that it can be initialized separately.
        """
        blocks = OrderedDict()

        # SincConvBlock
        out_channels = 128
        self.filters = SincConv(
            self.in_channels,
            out_channels,
            kernel_size=101,
            stride=1,
            fs=self.fs,
            window_func=self.windowing_type,
            scale_type=self.scale_type,
        )
        block = OrderedDict(
            [
                ("Filters", self.filters),
                ("LogCompression", LogCompression()),
                ("BatchNorm", torch.nn.BatchNorm1d(out_channels, affine=True)),
                ("AvgPool", torch.nn.AvgPool1d(2)),
            ]
        )
        blocks["SincConvBlock"] = torch.nn.Sequential(block)
        in_channels = out_channels

        # First convolutional block, connects the sinc output to the
        # front-end "body"
        out_channels = 128
        blocks["DConvBlock1"] = self.gen_lsc_block(
            in_channels,
            out_channels,
            depthwise_kernel_size=25,
            depthwise_stride=2,
            pointwise_groups=0,
            avgpool=True,
            dropout_probability=0.1,
        )
        in_channels = out_channels

        # Second convolutional block, multiple convolutional layers
        out_channels = self.out_channels
        for layer in [2, 3, 4]:
            blocks[f"DConvBlock{layer}"] = self.gen_lsc_block(
                in_channels,
                out_channels,
                depthwise_kernel_size=9,
                depthwise_stride=1,
            )
            in_channels = out_channels

        # Third Convolutional block, acts as coupling to encoder
        out_channels = self.out_channels
        blocks["DConvBlock5"] = self.gen_lsc_block(
            in_channels,
            out_channels,
            depthwise_kernel_size=7,
            depthwise_stride=1,
            pointwise_groups=0,
        )

        self.blocks = torch.nn.Sequential(blocks)

    def gen_lsc_block(
        self,
        in_channels: int,
        out_channels: int,
        depthwise_kernel_size: int = 9,
        depthwise_stride: int = 1,
        depthwise_groups=None,
        pointwise_groups=0,
        dropout_probability: float = 0.15,
        avgpool=False,
    ):
        """Generate a block for lightweight Sinc convolutions.

        The layers are added in this order: depthwise convolution, optional
        pointwise convolution, activation, batch normalization, optional
        average pooling, dropout.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels.
            depthwise_kernel_size: Kernel size of the depthwise convolution.
            depthwise_stride: Stride of the depthwise convolution.
            depthwise_groups: Number of groups of the depthwise convolution.
                If falsy (``None`` or 0), the greatest common divisor of
                ``in_channels`` and ``out_channels`` is used.
            pointwise_groups: Number of groups of the pointwise convolution.
                If falsy, no pointwise convolution is added.
            dropout_probability: Dropout probability in the block.
            avgpool: If True, an AvgPool layer is inserted.

        Returns:
            torch.nn.Sequential: Neural network building block.
        """
        block = OrderedDict()
        if not depthwise_groups:
            # GCD(in_channels, out_channels) to prevent size mismatches:
            # Conv1d needs both channel counts to be divisible by `groups`.
            # Euclid's algorithm; each step must swap the operands.
            depthwise_groups, r = in_channels, out_channels
            while r != 0:
                depthwise_groups, r = r, depthwise_groups % r
        block["depthwise"] = torch.nn.Conv1d(
            in_channels,
            out_channels,
            depthwise_kernel_size,
            depthwise_stride,
            groups=depthwise_groups,
        )
        if pointwise_groups:
            block["pointwise"] = torch.nn.Conv1d(
                out_channels, out_channels, 1, 1, groups=pointwise_groups
            )
        block["activation"] = self.choices_activation[self.activation_type]()
        block["batchnorm"] = torch.nn.BatchNorm1d(out_channels, affine=True)
        if avgpool:
            block["avgpool"] = torch.nn.AvgPool1d(2)
        block["dropout"] = self.choices_dropout[self.dropout_type](
            dropout_probability
        )
        return torch.nn.Sequential(block)

    def espnet_initialization_fn(self):
        """Initialize sinc filters with filterbank values.

        Also resets every affine ``BatchNorm1d`` layer in ``self.blocks``
        to weight 1 and bias 0.
        """
        self.filters.init_filters()
        for block in self.blocks:
            for layer in block:
                if isinstance(layer, torch.nn.BatchNorm1d) and layer.affine:
                    layer.weight.data[:] = 1.0
                    layer.bias.data[:] = 0.0

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Forward function.

        Args:
            input: Tensor of size (B, T, C_in, D_in).
            input_lengths: Lengths tensor; passed through unchanged.

        Returns:
            The output frames of size (B, T, C_out * D_out) and the
            unmodified ``input_lengths``.
        """
        # Transform input data:
        #   (B, T, C_in, D_in) -> (B*T, C_in, D_in)
        B, T, C_in, D_in = input.size()
        input_frames = input.view(B * T, C_in, D_in)
        # Call the module itself rather than `.forward` so that registered
        # hooks are executed as well.
        output_frames = self.blocks(input_frames)

        # ---TRANSFORM: (B*T, C_out, D_out) -> (B, T, C_out*D_out)
        _, C_out, D_out = output_frames.size()
        output_frames = output_frames.view(B, T, C_out * D_out)
        return output_frames, input_lengths  # no state in this layer

    def output_size(self) -> int:
        """Get the output size (``out_channels * in_channels``)."""
        return self.out_channels * self.in_channels
def _save_current_figure(args, img_name: str) -> str:
    """Save the active matplotlib figure into ``args.out_folder``."""
    img_path = str(args.out_folder / img_name)
    plt.savefig(img_path, bbox_inches="tight")
    print("Plotted %s" % img_path)
    return img_path


def plot_filter_kernels(filters: torch.Tensor, sample_rate: int, args):
    """Plot the Sinc filter kernels.

    An ensemble figure of four selected kernels
    (``filter_kernel_ensemble2.<filetype>``) is always written. If
    ``args.all`` is set, five additional figures are written for every
    filter: the unlearned kernel, the learned kernel, both kernels
    overlaid, the learned frequency band, and both frequency bands.

    Args:
        filters (torch.Tensor): Filter parameters.
        sample_rate (int): Sample rate of Signal.
        args: Object with output options, read by attribute: ``all``
            (plot every filter), ``filetype`` (image file extension) and
            ``out_folder`` (output directory; must support the ``/``
            operator, e.g. a ``pathlib.Path``).
    """
    from espnet2.layers.sinc_conv import SincConv

    print(
        "When plotting filter kernels, make sure the script has the"
        " correct SincConv settings (currently hard-coded)."
    )
    convs = SincConv(1, 128, 101)

    # unlearned
    convs._create_filters(convs.f.device)
    pre_kernels = convs.sinc_filters.detach().numpy()
    pre_filters = convs.f.detach().numpy()
    f_mins = np.abs(pre_filters[:, 0])
    f_maxs = np.abs(pre_filters[:, 0]) + np.abs(
        pre_filters[:, 1] - pre_filters[:, 0]
    )
    F_mins, F_maxs = f_mins * sample_rate, f_maxs * sample_rate
    # `np.int` was removed from NumPy; the builtin `int` is the same dtype.
    pre_F_mins = np.round(F_mins).astype(int)
    pre_F_maxs = np.round(F_maxs).astype(int)

    # learned
    convs.f = torch.nn.Parameter(torch.Tensor(filters))
    convs._create_filters(convs.f.device)
    kernels = convs.sinc_filters.detach().numpy()

    f_mins = np.abs(filters[:, 0])
    f_maxs = np.abs(filters[:, 0]) + np.abs(filters[:, 1] - filters[:, 0])
    F_mins, F_maxs = f_mins * sample_rate, f_maxs * sample_rate
    F_mins = np.round(F_mins).astype(int)
    F_maxs = np.round(F_maxs).astype(int)
    # Clipping against a float bound promotes the arrays to float; cast
    # back to int because the values are used as slice indices below.
    F_mins = np.clip(F_mins, 0, sample_rate / 2.0).astype(int)
    F_maxs = np.clip(F_maxs, 0, sample_rate / 2.0).astype(int)
    x_f = np.linspace(0.0, np.max(F_maxs), int(np.max(F_maxs)) + 1)
    x = np.arange(kernels.shape[2])

    if args.all:
        for i in range(len(kernels)):
            index = str(i).zfill(2)

            pre_kernel = pre_kernels[i][0]
            plt.clf()
            plt.xticks([])
            plt.yticks([])
            plt.plot(x, pre_kernel)
            _save_current_figure(
                args, "filter_pre_kernel_%s.%s" % (index, args.filetype)
            )

            kernel = kernels[i][0]
            plt.clf()
            plt.xticks([])
            plt.yticks([])
            plt.plot(x, kernel)
            _save_current_figure(
                args, "filter_kernel_%s.%s" % (index, args.filetype)
            )

            plt.clf()
            plt.xlabel("kernel index")
            plt.plot(x, kernel)
            plt.plot(x, pre_kernel, "--", alpha=0.5)
            _save_current_figure(
                args, "filter_kernel_both_%s.%s" % (index, args.filetype)
            )

            y = np.zeros_like(x_f)
            y[F_mins[i]:F_maxs[i]] = 1.0
            plt.clf()
            plt.plot(x_f, y)
            _save_current_figure(
                args, "filter_freq_%s.%s" % (index, args.filetype)
            )

            pre_y = np.zeros_like(x_f)
            pre_y[pre_F_mins[i]:pre_F_maxs[i]] = 1.0
            plt.clf()
            plt.plot(x_f, y)
            plt.plot(x_f, pre_y)
            _save_current_figure(
                args, "filter_freq_both_%s.%s" % (index, args.filetype)
            )

    plt.clf()
    # Hard-coded selection of filters shown in the 2x2 ensemble figure.
    selected = [32, 71, 113, 126]
    fig, axs = plt.subplots(2, 2, sharex=True, sharey="row")
    for ax, filter_index in zip(axs.flat, selected):
        ax.plot(x, kernels[filter_index][0])
        ax.plot(x, pre_kernels[filter_index][0], "--", alpha=0.5)
    _save_current_figure(
        args, "filter_kernel_ensemble2.%s" % (args.filetype)
    )
    plt.close(fig)
class LightweightSincConvs(AbsPreEncoder):
    """Lightweight Sinc Convolutions.

    Instead of using precomputed features, end-to-end speech recognition
    can also be done directly from raw audio using sinc convolutions, as
    described in "Lightweight End-to-End Speech Recognition from Raw Audio
    Data Using Sinc-Convolutions" by Kürzinger et al.
    https://arxiv.org/abs/2010.07597

    To use Sinc convolutions in your model instead of the default f-bank
    frontend, set this module as your pre-encoder with `preencoder: sinc`
    and use the input of the sliding window frontend with
    `frontend: sliding_window` in your yaml configuration file.
    So that the process flow is:

    Frontend (SlidingWindow) -> SpecAug -> Normalization ->
    Pre-encoder (LightweightSincConvs) -> Encoder -> Decoder

    Note that this method also performs data augmentation in time domain
    (vs. in spectral domain in the default frontend).
    Use `plot_sinc_filters.py` to visualize the learned Sinc filters.
    """

    def __init__(
        self,
        fs: Union[int, str, float] = 16000,
        in_channels: int = 1,
        out_channels: int = 256,
        activation_type: str = "leakyrelu",
        dropout_type: str = "dropout",
        windowing_type: str = "hamming",
        scale_type: str = "mel",
    ):
        """Initialize the module.

        Args:
            fs: Sample rate. A string is parsed with
                ``humanfriendly.parse_size``.
            in_channels: Number of input channels.
            out_channels: Number of output channels (for each input channel).
            activation_type: Choice of activation function
                ("leakyrelu" or "relu").
            dropout_type: Choice of dropout function
                ("dropout", "spatial" or "dropout2d").
            windowing_type: Choice of windowing function.
            scale_type:  Choice of filter-bank initialization scale.

        Raises:
            NotImplementedError: If ``dropout_type`` or ``activation_type``
                is not one of the supported choices.
        """
        assert check_argument_types()
        super().__init__()
        if isinstance(fs, str):
            fs = humanfriendly.parse_size(fs)
        self.fs = fs
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.activation_type = activation_type
        self.dropout_type = dropout_type
        self.windowing_type = windowing_type
        self.scale_type = scale_type

        self.choices_dropout = {
            "dropout": torch.nn.Dropout,
            "spatial": SpatialDropout,
            "dropout2d": torch.nn.Dropout2d,
        }
        if dropout_type not in self.choices_dropout:
            raise NotImplementedError(
                f"Dropout type has to be one of "
                f"{list(self.choices_dropout.keys())}",
            )

        self.choices_activation = {
            "leakyrelu": torch.nn.LeakyReLU,
            "relu": torch.nn.ReLU,
        }
        if activation_type not in self.choices_activation:
            raise NotImplementedError(
                f"Activation type has to be one of "
                f"{list(self.choices_activation.keys())}",
            )

        # initialization
        self._create_sinc_convs()
        # Sinc filters require custom initialization
        self.espnet_initialization_fn()

    def _create_sinc_convs(self):
        """Build all stages and store them as ``self.blocks``.

        The stages are ``SincConvBlock`` and ``DConvBlock1`` ..
        ``DConvBlock5``. The sinc filter layer is additionally kept as
        ``self.filters`` so that it can be initialized separately.
        """
        blocks = OrderedDict()

        # SincConvBlock
        out_channels = 128
        self.filters = SincConv(
            self.in_channels,
            out_channels,
            kernel_size=101,
            stride=1,
            fs=self.fs,
            window_func=self.windowing_type,
            scale_type=self.scale_type,
        )
        block = OrderedDict(
            [
                ("Filters", self.filters),
                ("LogCompression", LogCompression()),
                ("BatchNorm", torch.nn.BatchNorm1d(out_channels, affine=True)),
                ("AvgPool", torch.nn.AvgPool1d(2)),
            ]
        )
        blocks["SincConvBlock"] = torch.nn.Sequential(block)
        in_channels = out_channels

        # First convolutional block, connects the sinc output to the
        # front-end "body"
        out_channels = 128
        blocks["DConvBlock1"] = self.gen_lsc_block(
            in_channels,
            out_channels,
            depthwise_kernel_size=25,
            depthwise_stride=2,
            pointwise_groups=0,
            avgpool=True,
            dropout_probability=0.1,
        )
        in_channels = out_channels

        # Second convolutional block, multiple convolutional layers
        out_channels = self.out_channels
        for layer in [2, 3, 4]:
            blocks[f"DConvBlock{layer}"] = self.gen_lsc_block(
                in_channels,
                out_channels,
                depthwise_kernel_size=9,
                depthwise_stride=1,
            )
            in_channels = out_channels

        # Third Convolutional block, acts as coupling to encoder
        out_channels = self.out_channels
        blocks["DConvBlock5"] = self.gen_lsc_block(
            in_channels,
            out_channels,
            depthwise_kernel_size=7,
            depthwise_stride=1,
            pointwise_groups=0,
        )

        self.blocks = torch.nn.Sequential(blocks)

    def gen_lsc_block(
        self,
        in_channels: int,
        out_channels: int,
        depthwise_kernel_size: int = 9,
        depthwise_stride: int = 1,
        depthwise_groups=None,
        pointwise_groups=0,
        dropout_probability: float = 0.15,
        avgpool=False,
    ):
        """Generate a convolutional block for Lightweight Sinc convolutions.

        Each block consists of either a depthwise or a depthwise-separable
        convolutions together with dropout, (batch-)normalization layer, and
        an optional average-pooling layer.

        Args:
            in_channels: Number of input channels.
            out_channels: Number of output channels.
            depthwise_kernel_size: Kernel size of the depthwise convolution.
            depthwise_stride: Stride of the depthwise convolution.
            depthwise_groups: Number of groups of the depthwise convolution.
                If falsy (``None`` or 0), the greatest common divisor of
                ``in_channels`` and ``out_channels`` is used.
            pointwise_groups: Number of groups of the pointwise convolution.
                If falsy, no pointwise convolution is added.
            dropout_probability: Dropout probability in the block.
            avgpool: If True, an AvgPool layer is inserted.

        Returns:
            torch.nn.Sequential: Neural network building block.
        """
        block = OrderedDict()
        if not depthwise_groups:
            # GCD(in_channels, out_channels) to prevent size mismatches:
            # Conv1d needs both channel counts to be divisible by `groups`.
            # Euclid's algorithm; each step must swap the operands.
            depthwise_groups, r = in_channels, out_channels
            while r != 0:
                depthwise_groups, r = r, depthwise_groups % r
        block["depthwise"] = torch.nn.Conv1d(
            in_channels,
            out_channels,
            depthwise_kernel_size,
            depthwise_stride,
            groups=depthwise_groups,
        )
        if pointwise_groups:
            block["pointwise"] = torch.nn.Conv1d(
                out_channels, out_channels, 1, 1, groups=pointwise_groups
            )
        block["activation"] = self.choices_activation[self.activation_type]()
        block["batchnorm"] = torch.nn.BatchNorm1d(out_channels, affine=True)
        if avgpool:
            block["avgpool"] = torch.nn.AvgPool1d(2)
        block["dropout"] = self.choices_dropout[self.dropout_type](
            dropout_probability
        )
        return torch.nn.Sequential(block)

    def espnet_initialization_fn(self):
        """Initialize sinc filters with filterbank values.

        Also resets every affine ``BatchNorm1d`` layer in ``self.blocks``
        to weight 1 and bias 0.
        """
        self.filters.init_filters()
        for block in self.blocks:
            for layer in block:
                if isinstance(layer, torch.nn.BatchNorm1d) and layer.affine:
                    layer.weight.data[:] = 1.0
                    layer.bias.data[:] = 0.0

    def forward(
        self, input: torch.Tensor, input_lengths: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply Lightweight Sinc Convolutions.

        The input shall be formatted as (B, T, C_in, D_in)
        with B as batch size, T as time dimension, C_in as channels,
        and D_in as feature dimension.

        The output will then be (B, T, C_out*D_out)
        with C_out and D_out as output dimensions.

        The current module structure only handles D_in=400, so that D_out=1.
        Remark for the multichannel case: C_out is the number of out_channels
        given at initialization multiplied with C_in.

        Args:
            input: Input tensor of size (B, T, C_in, D_in).
            input_lengths: Lengths tensor; passed through unchanged.

        Returns:
            The output tensor of size (B, T, C_out*D_out) and the
            unmodified ``input_lengths``.
        """
        # Transform input data:
        #   (B, T, C_in, D_in) -> (B*T, C_in, D_in)
        B, T, C_in, D_in = input.size()
        input_frames = input.view(B * T, C_in, D_in)
        # Call the module itself rather than `.forward` so that registered
        # hooks are executed as well.
        output_frames = self.blocks(input_frames)

        # ---TRANSFORM: (B*T, C_out, D_out) -> (B, T, C_out*D_out)
        _, C_out, D_out = output_frames.size()
        output_frames = output_frames.view(B, T, C_out * D_out)
        return output_frames, input_lengths  # no state in this layer

    def output_size(self) -> int:
        """Get the output size (``out_channels * in_channels``)."""
        return self.out_channels * self.in_channels
def test_sinc_filter_output_size():
    """Check the output length SincConv reports for a 400-sample input."""
    layer = SincConv(in_channels=1, out_channels=128, kernel_size=101)
    n_input_samples = 400
    reported_odim = layer.get_odim(n_input_samples)
    assert reported_odim == 300
def test_sinc_filter_static_functions():
    """Smoke-test the static window and sinc helpers of SincConv."""
    length = 400
    ramp = torch.linspace(1, length, length)

    unwindowed = SincConv.none_window(ramp)
    print(f"no window function: {unwindowed}")

    hamming = SincConv.hamming_window(ramp)
    print(f"hamming window function: {hamming}")

    # only checks that a scalar tensor is accepted
    SincConv.sinc(torch.tensor(50.0))