def __init__(self,
             in_channels=1,
             out_channels=1,
             kernel_size=3,
             layers=30,
             stacks=3,
             residual_channels=64,
             gate_channels=128,
             skip_channels=64,
             aux_channels=80,
             aux_context_window=2,
             dropout=0.0,
             bias=True,
             use_weight_norm=True,
             use_causal_conv=False,
             upsample_conditional_features=True,
             upsample_net="ConvInUpsampleNetwork",
             upsample_params={"upsample_scales": [4, 4, 4, 4]},
             ):
    super(ParallelWaveGANGenerator, self).__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.aux_channels = aux_channels
    self.layers = layers
    self.stacks = stacks
    self.kernel_size = kernel_size

    # check the number of layers and stacks
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # define first convolution
    self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)

    # define conv + upsampling network
    if upsample_conditional_features:
        upsample_params.update({
            "use_causal_conv": use_causal_conv,
        })
        if upsample_net == "ConvInUpsampleNetwork":
            upsample_params.update({
                "aux_channels": aux_channels,
                "aux_context_window": aux_context_window,
            })
        self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
    else:
        self.upsample_net = None

    # define residual blocks
    self.conv_layers = torch.nn.ModuleList()
    for layer in range(layers):
        dilation = 2 ** (layer % layers_per_stack)
        conv = ResidualBlock(
            kernel_size=kernel_size,
            residual_channels=residual_channels,
            gate_channels=gate_channels,
            skip_channels=skip_channels,
            aux_channels=aux_channels,
            dilation=dilation,
            dropout=dropout,
            bias=bias,
            use_causal_conv=use_causal_conv,
        )
        self.conv_layers += [conv]

    # define output layers
    self.last_conv_layers = torch.nn.ModuleList([
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, skip_channels, bias=True),
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, out_channels, bias=True),
    ])

    # apply weight norm
    if use_weight_norm:
        self.apply_weight_norm()
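# Conv1d1x1 is used throughout these constructors but is not defined in this
# excerpt. As a rough, illustrative stand-in (an assumption, not the author's
# exact class), it can be read as a kernel-size-1 Conv1d, i.e. a per-timestep
# linear projection over channels:
import torch


class Conv1d1x1(torch.nn.Conv1d):
    """1x1 Conv1d: mixes channels without looking across time steps."""

    def __init__(self, in_channels, out_channels, bias=True):
        super().__init__(in_channels, out_channels,
                         kernel_size=1, padding=0, dilation=1, bias=bias)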
def __init__(self,
             in_channels=1,
             out_channels=1,
             kernel_size=3,
             layers=30,
             stacks=3,
             residual_channels=64,
             gate_channels=128,
             skip_channels=64,
             dropout=0.0,
             use_weight_norm=True,
             use_causal_conv=False,
             nonlinear_activation_params={"negative_slope": 0.2},
             ):
    """Initialize Parallel WaveGAN Discriminator module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Kernel size of dilated convolution.
        layers (int): Number of residual block layers.
        stacks (int): Number of stacks, i.e., dilation cycles.
        residual_channels (int): Number of channels in residual conv.
        gate_channels (int): Number of channels in gated conv.
        skip_channels (int): Number of channels in skip conv.
        dropout (float): Dropout rate. 0.0 means no dropout applied.
        use_weight_norm (bool): Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        use_causal_conv (bool): Whether to use causal structure.
        nonlinear_activation_params (dict): Nonlinear activation function parameters.

    """
    super(ResidualParallelWaveGANDiscriminator, self).__init__()
    assert (kernel_size - 1) % 2 == 0, "Even-number kernel size is not supported."

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.layers = layers
    self.stacks = stacks
    self.kernel_size = kernel_size

    # check the number of layers and stacks
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # define first convolution
    self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)
    self.first_conv_activation = torch.nn.LeakyReLU(
        negative_slope=nonlinear_activation_params['negative_slope'], inplace=True)

    # define residual blocks
    self.conv_layers = torch.nn.ModuleList()
    for layer in range(layers):
        dilation = 2 ** (layer % layers_per_stack)
        conv = ResidualBlock(
            kernel_size=kernel_size,
            residual_channels=residual_channels,
            gate_channels=gate_channels,
            skip_channels=skip_channels,
            aux_channels=-1,
            dilation=dilation,
            dropout=dropout,
            bias=True,  # NOTE: magenta uses bias, but musyoku doesn't
            use_causal_conv=use_causal_conv,
        )
        self.conv_layers += [conv]

    # define output layers
    self.last_conv_layers = torch.nn.ModuleList([
        torch.nn.LeakyReLU(
            negative_slope=nonlinear_activation_params['negative_slope'], inplace=True),
        Conv1d1x1(skip_channels, skip_channels, bias=True),
        torch.nn.LeakyReLU(
            negative_slope=nonlinear_activation_params['negative_slope'], inplace=True),
        Conv1d1x1(skip_channels, out_channels, bias=True),
    ])

    # apply weight norm
    if use_weight_norm:
        self.apply_weight_norm()
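# Usage sketch (illustrative, not from the source): assuming this class keeps
# the upstream ParallelWaveGAN interface, i.e. forward(x) with a waveform x of
# shape (B, 1, T) returning per-sample logits of the same length, a minimal
# smoke test could look like:
def _discriminator_smoke_test():
    import torch

    discriminator = ResidualParallelWaveGANDiscriminator()  # defaults as above
    wav = torch.randn(2, 1, 16000)   # (batch, channels, samples)
    logits = discriminator(wav)      # expected shape: (2, 1, 16000)
    return logits.shape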
def __init__(self,
             in_channels=1,
             out_channels=1,
             kernel_size=3,
             layers=30,
             stacks=3,
             residual_channels=64,
             gate_channels=128,
             skip_channels=64,
             aux_channels=80,
             aux_context_window=2,
             dropout=0.0,
             use_weight_norm=True,
             use_causal_conv=False,
             upsample_conditional_features=True,
             upsample_net="ConvInUpsampleNetwork",
             upsample_params={"upsample_scales": [4, 4, 4, 4]},
             ):
    """Initialize Parallel WaveGAN Generator module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Kernel size of dilated convolution.
        layers (int): Number of residual block layers.
        stacks (int): Number of stacks, i.e., dilation cycles.
        residual_channels (int): Number of channels in residual conv.
        gate_channels (int): Number of channels in gated conv.
        skip_channels (int): Number of channels in skip conv.
        aux_channels (int): Number of channels for auxiliary feature conv.
        aux_context_window (int): Context window size for auxiliary feature.
        dropout (float): Dropout rate. 0.0 means no dropout applied.
        use_weight_norm (bool): Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        use_causal_conv (bool): Whether to use causal structure.
        upsample_conditional_features (bool): Whether to use upsampling network.
        upsample_net (str): Upsampling network architecture.
        upsample_params (dict): Upsampling network parameters.

    """
    super(ParallelWaveGANGenerator, self).__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.aux_channels = aux_channels
    self.layers = layers
    self.stacks = stacks
    self.kernel_size = kernel_size

    # check the number of layers and stacks
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # define first convolution
    self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)

    # define conv + upsampling network
    if upsample_conditional_features:
        upsample_params.update({
            "use_causal_conv": use_causal_conv,
        })
        if upsample_net == "ConvInUpsampleNetwork":
            upsample_params.update({
                "aux_channels": aux_channels,
                "aux_context_window": aux_context_window,
            })
        self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
    else:
        self.upsample_net = None

    # define residual blocks
    self.conv_layers = torch.nn.ModuleList()
    for layer in range(layers):
        dilation = 2 ** (layer % layers_per_stack)
        conv = ResidualBlock(
            kernel_size=kernel_size,
            residual_channels=residual_channels,
            gate_channels=gate_channels,
            skip_channels=skip_channels,
            aux_channels=aux_channels,
            dilation=dilation,
            dropout=dropout,
            bias=True,  # NOTE: magenta uses bias, but musyoku doesn't
            use_causal_conv=use_causal_conv,
        )
        self.conv_layers += [conv]

    # define output layers
    self.last_conv_layers = torch.nn.ModuleList([
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, skip_channels, bias=True),
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, out_channels, bias=True),
    ])

    # apply weight norm
    if use_weight_norm:
        self.apply_weight_norm()
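# Usage sketch (illustrative, not from the source): assuming the upstream
# ParallelWaveGAN interface forward(x, c), where x is a noise signal of shape
# (B, 1, T) and c is the local conditioning feature (B, aux_channels, T').
# With the default ConvInUpsampleNetwork, c is expected to carry
# aux_context_window extra frames on each side, and T must equal
# (T' - 2 * aux_context_window) * prod(upsample_scales).
def _generator_smoke_test():
    import torch

    generator = ParallelWaveGANGenerator()          # defaults: hop = 4 * 4 * 4 * 4 = 256
    frames, hop, context = 40, 256, 2
    mel = torch.randn(2, 80, frames + 2 * context)  # (B, aux_channels, T')
    noise = torch.randn(2, 1, frames * hop)         # (B, 1, T)
    wav = generator(noise, mel)                     # expected shape: (2, 1, frames * hop)
    return wav.shape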
def __init__(self,
             in_channels=1,
             out_channels=1,
             kernel_size=3,
             layers=30,
             stacks=3,
             residual_channels=64,
             gate_channels=128,
             skip_channels=64,
             aux_channels=80,
             aux_context_window=2,
             dropout=0.0,
             bias=True,
             use_weight_norm=True,
             use_causal_conv=False,
             upsample_conditional_features=True,
             upsample_net="ConvInUpsampleNetwork",
             upsample_params={"upsample_scales": [4, 4, 4, 4]},  # 4, 5, 3, 5 in the studied config
             ):
    """Initialize Parallel WaveGAN Generator module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Kernel size of dilated convolution.
        layers (int): Number of residual block layers.
        stacks (int): Number of stacks, i.e., dilation cycles.
        residual_channels (int): Number of channels in residual conv.
        gate_channels (int): Number of channels in gated conv.
        skip_channels (int): Number of channels in skip conv.
        aux_channels (int): Number of channels for auxiliary feature conv.
        aux_context_window (int): Context window size for auxiliary feature.
        dropout (float): Dropout rate. 0.0 means no dropout applied.
        bias (bool): Whether to use bias parameter in conv layer.
        use_weight_norm (bool): Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        use_causal_conv (bool): Whether to use causal structure.
        upsample_conditional_features (bool): Whether to use upsampling network.
        upsample_net (str): Upsampling network architecture.
        upsample_params (dict): Upsampling network parameters.

    """
    super(ParallelWaveGANGenerator, self).__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.aux_channels = aux_channels
    self.aux_context_window = aux_context_window
    self.layers = layers
    self.stacks = stacks
    self.kernel_size = kernel_size

    # check the number of layers and stacks
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # define first convolution: 1 -> 64 channels, i.e. (B, 1, L) -> (B, 64, L)
    self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)

    # define conv + upsampling network
    if upsample_conditional_features:
        upsample_params.update({
            "use_causal_conv": use_causal_conv,
        })
        if upsample_net == "MelGANGenerator":
            assert aux_context_window == 0
            upsample_params.update({
                "use_weight_norm": False,  # not to apply twice
                "use_final_nonlinear_activation": False,
            })
            self.upsample_net = getattr(models, upsample_net)(**upsample_params)
        else:
            if upsample_net == "ConvInUpsampleNetwork":
                # upsample scales 4, 5, 3, 5 in the studied config;
                # the following two params are not used elsewhere
                upsample_params.update({
                    "aux_channels": aux_channels,              # 80
                    "aux_context_window": aux_context_window,  # 2
                })
            # conv_in: 80 -> 80, kernel 2 * win + 1, input already padded by
            #   win on each side: T -> T - 2 * win
            # upsample:
            #   stretch: (B, 1, 80, T) -> (B, 1, 80, (T - 2 * win) * scale),
            #     scales 4, 5, 3, 5, nearest-neighbor upsampling
            #   conv2d: 1 -> 1 channel, kernels 9, 11, 7, 11 for scales 4, 5, 3, 5, no bias
            #   returns (B, 80, (T - 2 * win) * scale) = (B, 80, L)
            # note: each node connects to two nodes in the next layer, at a
            #   different distance in adjacent layers, so the gradients do not
            #   all become identical.
            self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
        self.upsample_factor = np.prod(upsample_params["upsample_scales"])  # 300 = hop size
    else:
        self.upsample_net = None
        self.upsample_factor = 1

    # define residual blocks
    self.conv_layers = torch.nn.ModuleList()
    for layer in range(layers):
        dilation = 2 ** (layer % layers_per_stack)
        # 1. dilated conv1d, kernel 3: (B, 64, L) -> (B, 128, L)
        # 2. split: (B, 128, L) -> (B, 64, L), (B, 64, L)
        # 3. aux path: (B, 80, L) -1x1-> (B, 128, L) -> (B, 64, L), (B, 64, L)
        # 4. gate: tanh * sigmoid of the sums
        # 5. 1x1 -> skip channels (B, 64, L)
        # 6. 1x1, residual, scaled by 1/sqrt(2) -> residual channels (B, 64, L)
        conv = ResidualBlock(
            kernel_size=kernel_size,              # 3
            residual_channels=residual_channels,  # 64
            gate_channels=gate_channels,          # 128
            skip_channels=skip_channels,          # 64
            aux_channels=aux_channels,            # 80
            dilation=dilation,
            dropout=dropout,                      # 0
            bias=bias,                            # True
            use_causal_conv=use_causal_conv,      # False
        )
        self.conv_layers += [conv]

    # define output layers: 64 -> 64 -> 1
    self.last_conv_layers = torch.nn.ModuleList([
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, skip_channels, bias=True),
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, out_channels, bias=True),
    ])

    # apply weight norm
    if use_weight_norm:  # True
        self.apply_weight_norm()
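# Quick numeric check for the hop-size comment above (illustrative): the
# upsample factor is the product of the upsample scales, so the commented
# scales [4, 5, 3, 5] give a hop size of 300, i.e. one conditioning frame
# expands to 300 waveform samples, while the default [4, 4, 4, 4] gives 256.
def _upsample_factor_check():
    import numpy as np

    assert int(np.prod([4, 5, 3, 5])) == 300
    assert int(np.prod([4, 4, 4, 4])) == 256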
def __init__(self,
             in_channels=80,
             out_channels=1,
             channels=512,
             kernel_size=7,
             upsample_scales=(8, 8, 2, 2),
             upsample_kernel_sizes=(16, 16, 4, 4),
             resblock_kernel_sizes=(3, 7, 11),
             resblock_dilations=[(1, 3, 5), (1, 3, 5), (1, 3, 5)],
             use_additional_convs=True,
             bias=True,
             nonlinear_activation="LeakyReLU",
             nonlinear_activation_params={"negative_slope": 0.1},
             use_causal_conv=False,
             use_weight_norm=True,
             ):
    """Initialize HiFiGANGenerator module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        channels (int): Number of hidden representation channels.
        kernel_size (int): Kernel size of initial and final conv layer.
        upsample_scales (list): List of upsampling scales.
        upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
        resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
        resblock_dilations (list): List of dilation lists for residual blocks.
        use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
        bias (bool): Whether to add bias parameter in convolution layers.
        nonlinear_activation (str): Activation function module name.
        nonlinear_activation_params (dict): Hyperparameters for activation function.
        use_causal_conv (bool): Whether to use causal structure.
        use_weight_norm (bool): Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.

    """
    super().__init__()

    # check hyperparameters are valid
    assert kernel_size % 2 == 1, "Kernel size must be odd number."
    assert len(upsample_scales) == len(upsample_kernel_sizes)
    assert len(resblock_dilations) == len(resblock_kernel_sizes)

    # define modules
    self.num_upsamples = len(upsample_kernel_sizes)
    self.num_blocks = len(resblock_kernel_sizes)
    self.use_causal_conv = use_causal_conv
    if not use_causal_conv:
        self.input_conv = torch.nn.Conv1d(
            in_channels,
            channels,
            kernel_size,
            bias=bias,
            padding=(kernel_size - 1) // 2,
        )
    else:
        self.input_conv = CausalConv1d(
            in_channels,
            channels,
            kernel_size,
            bias=bias,
        )
    self.upsamples = torch.nn.ModuleList()
    self.blocks = torch.nn.ModuleList()
    for i in range(len(upsample_kernel_sizes)):
        assert upsample_kernel_sizes[i] == 2 * upsample_scales[i]
        if not use_causal_conv:
            self.upsamples += [
                torch.nn.Sequential(
                    getattr(torch.nn, nonlinear_activation)(
                        **nonlinear_activation_params),
                    torch.nn.ConvTranspose1d(
                        channels // (2 ** i),
                        channels // (2 ** (i + 1)),
                        upsample_kernel_sizes[i],
                        upsample_scales[i],
                        padding=upsample_scales[i] // 2 + upsample_scales[i] % 2,
                        output_padding=upsample_scales[i] % 2,
                        bias=bias,
                    ),
                )
            ]
        else:
            self.upsamples += [
                torch.nn.Sequential(
                    getattr(torch.nn, nonlinear_activation)(
                        **nonlinear_activation_params),
                    CausalConvTranspose1d(
                        channels // (2 ** i),
                        channels // (2 ** (i + 1)),
                        upsample_kernel_sizes[i],
                        upsample_scales[i],
                        bias=bias,
                    ),
                )
            ]
        for j in range(len(resblock_kernel_sizes)):
            self.blocks += [
                ResidualBlock(
                    kernel_size=resblock_kernel_sizes[j],
                    channels=channels // (2 ** (i + 1)),
                    dilations=resblock_dilations[j],
                    bias=bias,
                    use_additional_convs=use_additional_convs,
                    nonlinear_activation=nonlinear_activation,
                    nonlinear_activation_params=nonlinear_activation_params,
                    use_causal_conv=use_causal_conv,
                )
            ]
    if not use_causal_conv:
        self.output_conv = torch.nn.Sequential(
            # NOTE(kan-bayashi): follow official implementation but why
            #   using different slope parameter here? (0.1 vs. 0.01)
            torch.nn.LeakyReLU(),
            torch.nn.Conv1d(
                channels // (2 ** (i + 1)),
                out_channels,
                kernel_size,
                bias=bias,
                padding=(kernel_size - 1) // 2,
            ),
            torch.nn.Tanh(),
        )
    else:
        self.output_conv = torch.nn.Sequential(
            # NOTE(kan-bayashi): follow official implementation but why
            #   using different slope parameter here? (0.1 vs. 0.01)
            torch.nn.LeakyReLU(),
            CausalConv1d(
                channels // (2 ** (i + 1)),
                out_channels,
                kernel_size,
                bias=bias,
            ),
            torch.nn.Tanh(),
        )

    # apply weight norm
    if use_weight_norm:
        self.apply_weight_norm()

    # reset parameters
    self.reset_parameters()
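# Usage sketch (illustrative, not from the source): assuming the upstream
# HiFi-GAN interface forward(c) with c of shape (B, in_channels, T), the output
# length is T multiplied by the product of upsample_scales; with the defaults
# above that is T * 8 * 8 * 2 * 2 = T * 256.
def _hifigan_smoke_test():
    import torch

    generator = HiFiGANGenerator()   # defaults as above
    mel = torch.randn(2, 80, 40)     # (B, in_channels, frames)
    wav = generator(mel)             # expected shape: (2, 1, 40 * 256)
    return wav.shape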
def __init__(self,
             in_channels=1,
             out_channels=1,
             kernel_size=3,
             layers=30,
             stacks=3,
             residual_channels=64,
             gate_channels=128,
             skip_channels=64,
             aux_channels=80,
             aux_context_window=2,
             dropout=0.0,
             bias=True,
             use_weight_norm=True,
             use_causal_conv=False,
             upsample_conditional_features=True,
             upsample_net="ConvInUpsampleNetwork",
             upsample_params={"upsample_scales": [4, 4, 4, 4]},
             use_asr_layer=False,
             asr_config=None,
             asr_pretrained_file=None,
             asr_feature_layer_nth=-1,
             ):
    """Initialize Parallel WaveGAN Generator module.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Kernel size of dilated convolution.
        layers (int): Number of residual block layers.
        stacks (int): Number of stacks, i.e., dilation cycles.
        residual_channels (int): Number of channels in residual conv.
        gate_channels (int): Number of channels in gated conv.
        skip_channels (int): Number of channels in skip conv.
        aux_channels (int): Number of channels for auxiliary feature conv.
        aux_context_window (int): Context window size for auxiliary feature.
        dropout (float): Dropout rate. 0.0 means no dropout applied.
        bias (bool): Whether to use bias parameter in conv layer.
        use_weight_norm (bool): Whether to use weight norm.
            If set to true, it will be applied to all of the conv layers.
        use_causal_conv (bool): Whether to use causal structure.
        upsample_conditional_features (bool): Whether to use upsampling network.
        upsample_net (str): Upsampling network architecture.
        upsample_params (dict): Upsampling network parameters.
        use_asr_layer (bool): Whether to build an auxiliary ASR encoder.
        asr_config (str): Path to the ASR encoder configuration file.
        asr_pretrained_file (str): Path to a pretrained ASR model file.
        asr_feature_layer_nth (int): Index of the ASR layer used for feature extraction.

    """
    super(ParallelWaveGANGenerator, self).__init__()
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.aux_channels = aux_channels
    self.aux_context_window = aux_context_window
    self.layers = layers
    self.stacks = stacks
    self.kernel_size = kernel_size

    # check the number of layers and stacks
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # define first convolution
    self.first_conv = Conv1d1x1(in_channels, residual_channels, bias=True)

    # define conv + upsampling network
    if upsample_conditional_features:
        upsample_params.update({
            "use_causal_conv": use_causal_conv,
        })
        if upsample_net == "MelGANGenerator":
            assert aux_context_window == 0
            upsample_params.update({
                "use_weight_norm": False,  # not to apply twice
                "use_final_nonlinear_activation": False,
            })
            self.upsample_net = getattr(models, upsample_net)(**upsample_params)
        else:
            if upsample_net == "ConvInUpsampleNetwork":
                upsample_params.update({
                    "aux_channels": aux_channels,
                    "aux_context_window": aux_context_window,
                })
            self.upsample_net = getattr(upsample, upsample_net)(**upsample_params)
        self.upsample_factor = np.prod(upsample_params["upsample_scales"])
    else:
        self.upsample_net = None
        self.upsample_factor = 1

    # define residual blocks
    self.conv_layers = torch.nn.ModuleList()
    for layer in range(layers):
        dilation = 2 ** (layer % layers_per_stack)
        conv = ResidualBlock(
            kernel_size=kernel_size,
            residual_channels=residual_channels,
            gate_channels=gate_channels,
            skip_channels=skip_channels,
            aux_channels=aux_channels,
            dilation=dilation,
            dropout=dropout,
            bias=bias,
            use_causal_conv=use_causal_conv,
        )
        self.conv_layers += [conv]

    # define output layers
    self.last_conv_layers = torch.nn.ModuleList([
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, skip_channels, bias=True),
        torch.nn.ReLU(inplace=True),
        Conv1d1x1(skip_channels, out_channels, bias=True),
    ])

    # define optional ASR encoder used as a feature extractor
    if use_asr_layer:
        conf = OmegaConf.load(asr_config)
        encoder_conf = conf.model.encoder
        jasper = encoder_conf.jasper
        feat_in = encoder_conf.feat_in
        activation = encoder_conf.activation
        conv_mask = encoder_conf.conv_mask
        self.asr_layer = ConvASREncoder(jasper, activation, feat_in, conv_mask=conv_mask)
        # disable length masking inside the ASR encoder's masked convolutions
        for m in self.asr_layer.modules():
            if isinstance(m, MaskedConv1d):
                m.use_mask = False
    else:
        self.asr_layer = None
    self.asr_feature_layer_nth = asr_feature_layer_nth

    # apply weight norm
    if use_weight_norm:
        self.apply_weight_norm()
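# apply_weight_norm() is called by every constructor above but is not shown in
# this excerpt. As a sketch of the usual pattern (an assumption, not
# necessarily the author's exact code), it walks the module tree with
# Module.apply and wraps each convolution with torch.nn.utils.weight_norm:
def apply_weight_norm(self):
    """Apply weight normalization to all Conv1d/Conv2d submodules."""

    def _apply_weight_norm(m):
        if isinstance(m, (torch.nn.Conv1d, torch.nn.Conv2d)):
            torch.nn.utils.weight_norm(m)

    self.apply(_apply_weight_norm)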