def __init__(self,
             in_channels=1,
             out_channels=1,
             kernel_size=3,
             layers=30,
             stacks=3,
             residual_channels=64,
             gate_channels=128,
             skip_channels=64,
             aux_channels=80,
             aux_context_window=2,
             dropout=0.0,
             bias=True,
             use_weight_norm=True,
             use_causal_conv=False,
             upsample_conditional_features=True,
             upsample_net="ConvInUpsampleNetwork",
             upsample_params={"upsample_scales": [4, 4, 4, 4]},
             ):
    super(ParallelWaveGANGenerator, self).__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.aux_channels = aux_channels
    self.layers = layers
    self.stacks = stacks
    self.kernel_size = kernel_size

    # check the number of layers and stacks
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # define first convolution
    self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)

    # define conv + upsampling network
    if upsample_conditional_features:
        upsample_params.update({
            "use_causal_conv": use_causal_conv,
        })
        if upsample_net == "ConvInUpsampleNetwork":
            upsample_params.update({
                "aux_channels": aux_channels,
                "aux_context_window": aux_context_window,
            })
        self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
    else:
        self.upsample_net = None

    # define residual blocks
    self.conv_layers = torch.nn.ModuleList()
    for layer in range(layers):
        dilation = 2 ** (layer % layers_per_stack)
        conv = ResidualBlock(
            kernel_size=kernel_size,
            residual_channels=residual_channels,
            gate_channels=gate_channels,
            skip_channels=skip_channels,
            aux_channels=aux_channels,
            dilation=dilation,
            dropout=dropout,
            bias=bias,
            use_causal_conv=use_causal_conv,
        )
        self.conv_layers += [conv]

    # define output layers
    self.last_conv_layers = torch.nn.ModuleList([
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, skip_channels, bias=True),
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, out_channels, bias=True),
    ])

    # apply weight norm
    if use_weight_norm:
        self.apply_weight_norm()
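# The __init__ above ends with a call to self.apply_weight_norm(), which is not
# shown in this excerpt.  Below is a minimal sketch of what such a helper
# presumably looks like, applying torch.nn.utils.weight_norm to every
# Conv1d/Conv2d submodule via self.apply(); this is an assumption about the
# helper, not the repository's verbatim code.
import torch


def apply_weight_norm(self):
    """Apply weight normalization to all conv layers (sketch)."""
    def _apply_weight_norm(m):
        if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)):
            torch.nn.utils.weight_norm(m)

    self.apply(_apply_weight_norm)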
def test_conv_initialization():
    # check the customized initialization: zero bias for all conv layers and
    # Conv2d weights set to 1 / prod(kernel_size) (an averaging kernel)
    conv = Conv1d(10, 10, 3, bias=True)
    np.testing.assert_array_equal(conv.bias.data.numpy(),
                                  np.zeros_like(conv.bias.data.numpy()))
    conv1x1 = Conv1d1x1(10, 10, bias=True)
    np.testing.assert_array_equal(conv1x1.bias.data.numpy(),
                                  np.zeros_like(conv1x1.bias.data.numpy()))
    kernel_size = (10, 10)
    conv2d = Conv2d(10, 10, kernel_size, bias=True)
    np.testing.assert_array_equal(
        conv2d.weight.data.numpy(),
        np.ones_like(conv2d.weight.data.numpy()) / np.prod(kernel_size))
    np.testing.assert_array_equal(conv2d.bias.data.numpy(),
                                  np.zeros_like(conv2d.bias.data.numpy()))
    kernel_size = (1, 10)
    conv2d = Conv2d(10, 10, kernel_size, bias=True)
    np.testing.assert_array_equal(
        conv2d.weight.data.numpy(),
        np.ones_like(conv2d.weight.data.numpy()) / np.prod(kernel_size))
    np.testing.assert_array_equal(conv2d.bias.data.numpy(),
                                  np.zeros_like(conv2d.bias.data.numpy()))
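# The test above expects zero-initialized biases and Conv2d weights equal to
# 1 / prod(kernel_size).  A minimal sketch of custom layers that would satisfy
# it, assuming the repository implements them by overriding reset_parameters();
# the class bodies below are illustrative, not the repository's definitions.
import numpy as np
import torch


class Conv1d(torch.nn.Conv1d):
    """Conv1d with customized initialization: zero bias (sketch)."""

    def reset_parameters(self):
        torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu")
        if self.bias is not None:
            torch.nn.init.zeros_(self.bias)


class Conv2d(torch.nn.Conv2d):
    """Conv2d initialized as an averaging filter with zero bias (sketch)."""

    def reset_parameters(self):
        self.weight.data.fill_(1.0 / np.prod(self.kernel_size))
        if self.bias is not None:
            torch.nn.init.zeros_(self.bias)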
def __init__(self,
             in_channels=1,
             out_channels=1,
             kernel_size=3,
             layers=30,
             stacks=3,
             residual_channels=64,
             gate_channels=128,
             skip_channels=64,
             dropout=0.0,
             use_weight_norm=True,
             use_causal_conv=False,
             nonlinear_activation_params={"negative_slope": 0.2},
             ):
    """Initialize Parallel WaveGAN Discriminator module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Kernel size of dilated convolution.
        layers (int): Number of residual block layers.
        stacks (int): Number of stacks, i.e., dilation cycles.
        residual_channels (int): Number of channels in residual conv.
        gate_channels (int): Number of channels in gated conv.
        skip_channels (int): Number of channels in skip conv.
        dropout (float): Dropout rate. 0.0 means no dropout applied.
        use_weight_norm (bool): Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        use_causal_conv (bool): Whether to use causal structure.
        nonlinear_activation_params (dict): Nonlinear activation function parameters.

    """
    super(ResidualParallelWaveGANDiscriminator, self).__init__()
    assert (kernel_size - 1) % 2 == 0, "Even kernel sizes are not supported."
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.layers = layers
    self.stacks = stacks
    self.kernel_size = kernel_size

    # check the number of layers and stacks
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # define first convolution
    self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)
    self.first_conv_activation = torch.nn.LeakyReLU(
        negative_slope=nonlinear_activation_params["negative_slope"], inplace=True)

    # define residual blocks
    self.conv_layers = torch.nn.ModuleList()
    for layer in range(layers):
        dilation = 2 ** (layer % layers_per_stack)
        conv = ResidualBlock(
            kernel_size=kernel_size,
            residual_channels=residual_channels,
            gate_channels=gate_channels,
            skip_channels=skip_channels,
            aux_channels=-1,
            dilation=dilation,
            dropout=dropout,
            bias=True,  # NOTE: magenta uses bias, but musyoku doesn't
            use_causal_conv=use_causal_conv,
        )
        self.conv_layers += [conv]

    # define output layers
    self.last_conv_layers = torch.nn.ModuleList([
        torch.nn.LeakyReLU(
            negative_slope=nonlinear_activation_params["negative_slope"], inplace=True),
        Conv1d1x1(skip_channels, skip_channels, bias=True),
        torch.nn.LeakyReLU(
            negative_slope=nonlinear_activation_params["negative_slope"], inplace=True),
        Conv1d1x1(skip_channels, out_channels, bias=True),
    ])

    # apply weight norm
    if use_weight_norm:
        self.apply_weight_norm()
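# Hedged usage sketch for the discriminator above (not from the repository).
# It assumes the forward signature is forward(x) with x a waveform of shape
# (B, 1, T) and that the per-sample output keeps the time resolution, i.e.
# (B, out_channels, T).  The import path follows the usual parallel_wavegan
# package layout but is an assumption in this sketch.
import torch

from parallel_wavegan.models import ResidualParallelWaveGANDiscriminator

discriminator = ResidualParallelWaveGANDiscriminator()
x = torch.randn(2, 1, 16000)       # a batch of two 1-second 16 kHz waveforms
with torch.no_grad():
    p = discriminator(x)           # per-sample real/fake scores, expected (2, 1, 16000)
print(p.shape)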
def __init__(self,
             in_channels=1,
             out_channels=1,
             kernel_size=3,
             layers=30,
             stacks=3,
             residual_channels=64,
             gate_channels=128,
             skip_channels=64,
             aux_channels=80,
             aux_context_window=2,
             dropout=0.0,
             use_weight_norm=True,
             use_causal_conv=False,
             upsample_conditional_features=True,
             upsample_net="ConvInUpsampleNetwork",
             upsample_params={"upsample_scales": [4, 4, 4, 4]},
             ):
    """Initialize Parallel WaveGAN Generator module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Kernel size of dilated convolution.
        layers (int): Number of residual block layers.
        stacks (int): Number of stacks, i.e., dilation cycles.
        residual_channels (int): Number of channels in residual conv.
        gate_channels (int): Number of channels in gated conv.
        skip_channels (int): Number of channels in skip conv.
        aux_channels (int): Number of channels for auxiliary feature conv.
        aux_context_window (int): Context window size for auxiliary feature.
        dropout (float): Dropout rate. 0.0 means no dropout applied.
        use_weight_norm (bool): Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        use_causal_conv (bool): Whether to use causal structure.
        upsample_conditional_features (bool): Whether to use upsampling network.
        upsample_net (str): Upsampling network architecture.
        upsample_params (dict): Upsampling network parameters.

    """
    super(ParallelWaveGANGenerator, self).__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.aux_channels = aux_channels
    self.layers = layers
    self.stacks = stacks
    self.kernel_size = kernel_size

    # check the number of layers and stacks
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # define first convolution
    self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)

    # define conv + upsampling network
    if upsample_conditional_features:
        upsample_params.update({
            "use_causal_conv": use_causal_conv,
        })
        if upsample_net == "ConvInUpsampleNetwork":
            upsample_params.update({
                "aux_channels": aux_channels,
                "aux_context_window": aux_context_window,
            })
        self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
    else:
        self.upsample_net = None

    # define residual blocks
    self.conv_layers = torch.nn.ModuleList()
    for layer in range(layers):
        dilation = 2 ** (layer % layers_per_stack)
        conv = ResidualBlock(
            kernel_size=kernel_size,
            residual_channels=residual_channels,
            gate_channels=gate_channels,
            skip_channels=skip_channels,
            aux_channels=aux_channels,
            dilation=dilation,
            dropout=dropout,
            bias=True,  # NOTE: magenta uses bias, but musyoku doesn't
            use_causal_conv=use_causal_conv,
        )
        self.conv_layers += [conv]

    # define output layers
    self.last_conv_layers = torch.nn.ModuleList([
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, skip_channels, bias=True),
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, out_channels, bias=True),
    ])

    # apply weight norm
    if use_weight_norm:
        self.apply_weight_norm()
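# Hedged usage sketch for the generator above (not from the repository).  It
# assumes the forward signature is forward(x, c), with x a noise signal of
# shape (B, 1, T) and c auxiliary features of shape (B, aux_channels, T'), and
# that the ConvInUpsampleNetwork trims 2 * aux_context_window frames before
# upsampling, so T = (T' - 2 * aux_context_window) * prod(upsample_scales).
# The import path follows the usual parallel_wavegan layout but is assumed here.
import torch

from parallel_wavegan.models import ParallelWaveGANGenerator

generator = ParallelWaveGANGenerator()   # defaults: 30 layers, 3 stacks
frames = 10                              # conditioning frames, incl. 2 context frames per side
hop_size = 4 * 4 * 4 * 4                 # prod of the default upsample_scales = 256
samples = (frames - 2 * 2) * hop_size    # 1536 waveform samples
z = torch.randn(1, 1, samples)           # input noise
c = torch.randn(1, 80, frames)           # mel-spectrogram-like conditioning
with torch.no_grad():
    y = generator(z, c)                  # expected output shape: (1, 1, samples)
print(y.shape)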
def __init__(self,
             in_channels=1,
             out_channels=1,
             kernel_size=3,
             layers=30,
             stacks=3,
             residual_channels=64,
             gate_channels=128,
             skip_channels=64,
             aux_channels=80,
             aux_context_window=2,
             dropout=0.0,
             bias=True,
             use_weight_norm=True,
             use_causal_conv=False,
             upsample_conditional_features=True,
             upsample_net="ConvInUpsampleNetwork",
             upsample_params={"upsample_scales": [4, 4, 4, 4]},  # e.g. 4, 5, 3, 5 for hop size 300
             ):
    """Initialize Parallel WaveGAN Generator module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Kernel size of dilated convolution.
        layers (int): Number of residual block layers.
        stacks (int): Number of stacks, i.e., dilation cycles.
        residual_channels (int): Number of channels in residual conv.
        gate_channels (int): Number of channels in gated conv.
        skip_channels (int): Number of channels in skip conv.
        aux_channels (int): Number of channels for auxiliary feature conv.
        aux_context_window (int): Context window size for auxiliary feature.
        dropout (float): Dropout rate. 0.0 means no dropout applied.
        bias (bool): Whether to use bias parameter in conv layer.
        use_weight_norm (bool): Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        use_causal_conv (bool): Whether to use causal structure.
        upsample_conditional_features (bool): Whether to use upsampling network.
        upsample_net (str): Upsampling network architecture.
        upsample_params (dict): Upsampling network parameters.

    """
    super(ParallelWaveGANGenerator, self).__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.aux_channels = aux_channels
    self.aux_context_window = aux_context_window
    self.layers = layers
    self.stacks = stacks
    self.kernel_size = kernel_size

    # check the number of layers and stacks
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # define first convolution: 1 -> 64 channels, i.e. (B, 1, L) -> (B, 64, L)
    self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)

    # define conv + upsampling network
    if upsample_conditional_features:
        upsample_params.update({
            "use_causal_conv": use_causal_conv,
        })
        if upsample_net == "MelGANGenerator":
            assert aux_context_window == 0
            upsample_params.update({
                "use_weight_norm": False,  # not to apply twice
                "use_final_nonlinear_activation": False,
            })
            self.upsample_net = getattr(models, upsample_net)(**upsample_params)
        else:
            if upsample_net == "ConvInUpsampleNetwork":
                # e.g. upsample scales 4, 5, 3, 5; the following two params are not used
                upsample_params.update({
                    "aux_channels": aux_channels,              # 80
                    "aux_context_window": aux_context_window,  # 2
                })
            # ConvInUpsampleNetwork:
            #   conv-in: 80 -> 80, kernel 2*win+1 (win=2); input already padded by win
            #     frames on each side: T -> T - 2*win
            #   stretch: (B, 1, 80, T-2*win) -> (B, 1, 80, (T-2*win)*scale) via
            #     nearest-neighbor upsampling with scales 4, 5, 3, 5
            #   conv2d: 1 -> 1 channel, kernels 9, 11, 7, 11 for scales 4, 5, 3, 5, no bias
            #   returns (B, 80, (T-2*win)*scale) = (B, 80, L)
            # note: every node connects to 2 nodes in the next layer, and adjacent layers
            #   use different distances, so this does not cause identical gradients.
            self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
        self.upsample_factor = np.prod(upsample_params["upsample_scales"])  # 300 = hop size
    else:
        self.upsample_net = None
        self.upsample_factor = 1

    # define residual blocks
    self.conv_layers = torch.nn.ModuleList()
    for layer in range(layers):
        dilation = 2 ** (layer % layers_per_stack)
        # Each ResidualBlock:
        #   1. dilated conv1d, kernel 3: (B, 64, L) -> (B, 128, L)
        #   2. split: (B, 128, L) -> (B, 64, L), (B, 64, L)
        #   3. aux path: (B, 80, L) -1x1-> (B, 128, L) -> (B, 64, L), (B, 64, L)
        #   4. gate: tanh * sigmoid of the sums
        #   5. 1x1 -> skip channels: (B, 64, L)
        #   6. 1x1, scaled by 1/sqrt(2) -> residual channels: (B, 64, L)
        conv = ResidualBlock(
            kernel_size=kernel_size,              # 3
            residual_channels=residual_channels,  # 64
            gate_channels=gate_channels,          # 128
            skip_channels=skip_channels,          # 64
            aux_channels=aux_channels,            # 80
            dilation=dilation,
            dropout=dropout,                      # 0
            bias=bias,                            # True
            use_causal_conv=use_causal_conv,      # False
        )
        self.conv_layers += [conv]

    # define output layers: 64 -> 64 -> 1
    self.last_conv_layers = torch.nn.ModuleList([
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, skip_channels, bias=True),
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, out_channels, bias=True),
    ])

    # apply weight norm
    if use_weight_norm:  # True
        self.apply_weight_norm()
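# With dilation = 2 ** (layer % layers_per_stack), each stack cycles through
# dilations 1, 2, 4, ..., 2 ** (layers_per_stack - 1).  A small helper (a
# sketch, not part of the class above) that computes the receptive field of the
# resulting stack of dilated convolutions:
def receptive_field_size(layers=30, stacks=3, kernel_size=3):
    """Return the receptive field of cycled dilated convs (sketch)."""
    assert layers % stacks == 0
    layers_per_stack = layers // stacks
    dilations = [2 ** (i % layers_per_stack) for i in range(layers)]
    # each dilated conv adds (kernel_size - 1) * dilation samples of context
    return (kernel_size - 1) * sum(dilations) + 1


print(receptive_field_size())  # 6139 for the 30-layer / 3-stack / kernel-3 defaults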
def __init__(self,
             in_channels=1,
             out_channels=1,
             kernel_size=3,
             layers=30,
             stacks=3,
             residual_channels=64,
             gate_channels=128,
             skip_channels=64,
             aux_channels=80,
             aux_context_window=2,
             dropout=0.0,
             bias=True,
             use_weight_norm=True,
             use_causal_conv=False,
             upsample_conditional_features=True,
             upsample_net="ConvInUpsampleNetwork",
             upsample_params={"upsample_scales": [4, 4, 4, 4]},
             use_asr_layer=False,
             asr_config=None,
             asr_pretrained_file=None,
             asr_feature_layer_nth=-1,
             ):
    """Initialize Parallel WaveGAN Generator module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Kernel size of dilated convolution.
        layers (int): Number of residual block layers.
        stacks (int): Number of stacks, i.e., dilation cycles.
        residual_channels (int): Number of channels in residual conv.
        gate_channels (int): Number of channels in gated conv.
        skip_channels (int): Number of channels in skip conv.
        aux_channels (int): Number of channels for auxiliary feature conv.
        aux_context_window (int): Context window size for auxiliary feature.
        dropout (float): Dropout rate. 0.0 means no dropout applied.
        bias (bool): Whether to use bias parameter in conv layer.
        use_weight_norm (bool): Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        use_causal_conv (bool): Whether to use causal structure.
        upsample_conditional_features (bool): Whether to use upsampling network.
        upsample_net (str): Upsampling network architecture.
        upsample_params (dict): Upsampling network parameters.

    """
    super(ParallelWaveGANGenerator, self).__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.aux_channels = aux_channels
    self.aux_context_window = aux_context_window
    self.layers = layers
    self.stacks = stacks
    self.kernel_size = kernel_size

    # check the number of layers and stacks
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # define first convolution
    self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)

    # define conv + upsampling network
    if upsample_conditional_features:
        upsample_params.update({
            "use_causal_conv": use_causal_conv,
        })
        if upsample_net == "MelGANGenerator":
            assert aux_context_window == 0
            upsample_params.update({
                "use_weight_norm": False,  # not to apply twice
                "use_final_nonlinear_activation": False,
            })
            self.upsample_net = getattr(models, upsample_net)(**upsample_params)
        else:
            if upsample_net == "ConvInUpsampleNetwork":
                upsample_params.update({
                    "aux_channels": aux_channels,
                    "aux_context_window": aux_context_window,
                })
            self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
        self.upsample_factor = np.prod(upsample_params["upsample_scales"])
    else:
        self.upsample_net = None
        self.upsample_factor = 1

    # define residual blocks
    self.conv_layers = torch.nn.ModuleList()
    for layer in range(layers):
        dilation = 2 ** (layer % layers_per_stack)
        conv = ResidualBlock(
            kernel_size=kernel_size,
            residual_channels=residual_channels,
            gate_channels=gate_channels,
            skip_channels=skip_channels,
            aux_channels=aux_channels,
            dilation=dilation,
            dropout=dropout,
            bias=bias,
            use_causal_conv=use_causal_conv,
        )
        self.conv_layers += [conv]

    # define output layers
    self.last_conv_layers = torch.nn.ModuleList([
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, skip_channels, bias=True),
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, out_channels, bias=True),
    ])

    if use_asr_layer:
        # build an auxiliary ASR encoder from the given config
        conf = OmegaConf.load(asr_config)
        encoder_conf = conf.model.encoder
        jasper = encoder_conf.jasper
        feat_in = encoder_conf.feat_in
        activation = encoder_conf.activation
        conv_mask = encoder_conf.conv_mask
        self.asr_layer = ConvASREncoder(jasper, activation, feat_in, conv_mask=conv_mask)
        # disable sequence-length masking in the masked conv layers
        for m in self.asr_layer.modules():
            if isinstance(m, MaskedConv1d):
                m.use_mask = False
    else:
        self.asr_layer = None
    self.asr_feature_layer_nth = asr_feature_layer_nth

    # apply weight norm
    if use_weight_norm:
        self.apply_weight_norm()
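# Hedged construction sketch for the ASR-augmented variant above.  The config
# path is hypothetical; the code only requires that OmegaConf can load it and
# that conf.model.encoder exposes jasper, feat_in, activation, and conv_mask
# (a NeMo-style ConvASREncoder configuration).  asr_pretrained_file is accepted
# but not consumed in the excerpt shown here.
generator = ParallelWaveGANGenerator(
    use_asr_layer=True,
    asr_config="conf/asr_encoder.yaml",  # hypothetical path
    asr_feature_layer_nth=-1,            # presumably the encoder layer used for feature extraction
)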