def __init__(self, labels=256, layers=20, stacks=2, residual_channels=512,
             gate_channels=512, skip_out_channels=512, kernel_size=3,
             dropout=1 - 0.95, cin_channels=-1, gin_channels=-1,
             n_speakers=None, weight_normalization=True,
             upsample_conditional_features=False, upsample_scales=None):
    """Build the WaveNet module graph.

    Args:
        labels (int): Number of discrete output classes; also the input
            channel count of the first 1x1 convolution.
        layers (int): Total number of dilated residual layers.
        stacks (int): Number of dilation stacks; ``layers`` must be a
            multiple of ``stacks`` (asserted below).
        residual_channels (int): Channels of the residual connections.
        gate_channels (int): Channels of the gated activation units.
        skip_out_channels (int): Channels of the skip connections.
        kernel_size (int): Kernel size of the dilated convolutions.
        dropout (float): Dropout probability passed to each residual layer.
        cin_channels (int): Local conditioning channels (-1 disables).
        gin_channels (int): Global conditioning channels (<= 0 disables
            the speaker embedding).
        n_speakers (int): Required when ``gin_channels > 0``.
        weight_normalization (bool): Forwarded to the conv constructors.
        upsample_conditional_features (bool): Build the transposed-conv
            upsampling net for local conditioning features.
        upsample_scales (list): Per-layer upsampling strides; only read
            when ``upsample_conditional_features`` is true.
    """
    super(WaveNet, self).__init__()
    self.labels = labels
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # Input projection: label channels -> residual channels.
    self.first_conv = Conv1d1x1(labels, residual_channels)

    # Dilated residual stack; the dilation pattern restarts at every
    # stack boundary (2 ** (layer index within the stack)).
    self.conv_layers = nn.ModuleList()
    for idx in range(layers):
        self.conv_layers.append(ResidualConv1dGLU(
            residual_channels,
            gate_channels,
            kernel_size=kernel_size,
            skip_out_channels=skip_out_channels,
            bias=True,  # magenta uses bias, but musyoku doesn't
            dilation=2**(idx % layers_per_stack),
            dropout=dropout,
            cin_channels=cin_channels,
            gin_channels=gin_channels,
            weight_normalization=weight_normalization))

    # Output head: ReLU -> 1x1 -> ReLU -> 1x1 down to `labels` logits.
    self.last_conv_layers = nn.ModuleList([
        nn.ReLU(inplace=True),
        Conv1d1x1(skip_out_channels, skip_out_channels,
                  weight_normalization=weight_normalization),
        nn.ReLU(inplace=True),
        Conv1d1x1(skip_out_channels, labels,
                  weight_normalization=weight_normalization),
    ])

    # Global conditioning via a learned speaker embedding.
    if gin_channels > 0:
        assert n_speakers is not None
        self.embed_speakers = Embedding(
            n_speakers, gin_channels, padding_idx=None, std=0.1)

    # Transposed-conv net that upsamples local conditioning features.
    # Each layer is initialized to average-pool-like weights
    # (zero bias, uniform 1/cin_channels weight).
    if upsample_conditional_features:
        self.upsample_conv = nn.ModuleList()
        for scale in upsample_scales:
            convt = ConvTranspose1d(cin_channels, cin_channels,
                                    kernel_size=scale, padding=0,
                                    dilation=1, stride=scale, std_mul=1.0)
            convt.bias.data.zero_()
            convt.weight.data.fill_(1 / cin_channels)
            self.upsample_conv.append(convt)
            # Is this non-linearity necessary?
            self.upsample_conv.append(nn.ReLU(inplace=True))
    else:
        self.upsample_conv = None

    self.receptive_field = receptive_field_size(layers, stacks, kernel_size)
def __init__(self,
             out_channels=256,
             layers=20,
             stacks=2,
             residual_channels=512,
             gate_channels=512,
             skip_out_channels=512,
             kernel_size=3,
             dropout=1 - 0.95,
             cin_channels=-1,
             gin_channels=-1,
             n_speakers=None,
             weight_normalization=True,
             upsample_conditional_features=False,
             upsample_scales=None,
             freq_axis_kernel_size=3,
             scalar_input=False,
             ):
    """Build the WaveNet module graph.

    Args:
        out_channels (int): Output channel count of the final 1x1 conv;
            also the input channel count when ``scalar_input`` is false.
        layers (int): Total number of dilated residual layers.
        stacks (int): Number of dilation stacks; ``layers`` must be a
            multiple of ``stacks`` (asserted below).
        residual_channels (int): Channels of the residual connections.
        gate_channels (int): Channels of the gated activation units.
        skip_out_channels (int): Channels of the skip connections.
        kernel_size (int): Kernel size of the dilated convolutions.
        dropout (float): Dropout probability passed to each residual layer.
        cin_channels (int): Local conditioning channels (-1 disables).
        gin_channels (int): Global conditioning channels (<= 0 disables
            the speaker embedding).
        n_speakers (int): Required when ``gin_channels > 0``.
        weight_normalization (bool): Forwarded to the conv constructors.
        upsample_conditional_features (bool): Build the 2D transposed-conv
            upsampling net for local conditioning features.
        upsample_scales (list): Time-axis strides of the upsampling layers;
            only read when ``upsample_conditional_features`` is true.
        freq_axis_kernel_size (int): Frequency-axis kernel size of the
            upsampling convolutions ("same" padding on that axis).
        scalar_input (bool): When true the network consumes a single-channel
            (scalar) input instead of ``out_channels`` channels.
    """
    super(WaveNet, self).__init__()
    self.scalar_input = scalar_input
    self.out_channels = out_channels
    self.cin_channels = cin_channels
    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # Scalar input is a single channel; otherwise the input carries
    # `out_channels` channels.
    in_channels = 1 if scalar_input else out_channels
    self.first_conv = Conv1d1x1(in_channels, residual_channels)

    # Dilated residual stack; the dilation pattern restarts at every
    # stack boundary (2 ** (layer index within the stack)).
    self.conv_layers = nn.ModuleList()
    for idx in range(layers):
        self.conv_layers.append(ResidualConv1dGLU(
            residual_channels,
            gate_channels,
            kernel_size=kernel_size,
            skip_out_channels=skip_out_channels,
            bias=True,  # magenta uses bias, but musyoku doesn't
            dilation=2**(idx % layers_per_stack),
            dropout=dropout,
            cin_channels=cin_channels,
            gin_channels=gin_channels,
            weight_normalization=weight_normalization))

    # Output head: ReLU -> 1x1 -> ReLU -> 1x1 down to `out_channels`.
    self.last_conv_layers = nn.ModuleList([
        nn.ReLU(inplace=True),
        Conv1d1x1(skip_out_channels, skip_out_channels,
                  weight_normalization=weight_normalization),
        nn.ReLU(inplace=True),
        Conv1d1x1(skip_out_channels, out_channels,
                  weight_normalization=weight_normalization),
    ])

    # Global conditioning via a learned speaker embedding.
    if gin_channels > 0:
        assert n_speakers is not None
        self.embed_speakers = Embedding(n_speakers, gin_channels,
                                        padding_idx=None, std=0.1)
    else:
        self.embed_speakers = None

    # 2D transposed-conv net that upsamples local conditioning features
    # along the time axis only (stride 1 on the frequency axis).
    if upsample_conditional_features:
        self.upsample_conv = nn.ModuleList()
        freq_axis_padding = (freq_axis_kernel_size - 1) // 2
        for scale in upsample_scales:
            convt = ConvTranspose2d(
                1, 1, (freq_axis_kernel_size, scale),
                padding=(freq_axis_padding, 0),
                dilation=1, stride=(1, scale),
                weight_normalization=weight_normalization)
            self.upsample_conv.append(convt)
            # ReLU keeps the upsampled conditioning non-negative
            # (features assumed scaled to [0, 1]).
            self.upsample_conv.append(nn.ReLU(inplace=True))
    else:
        self.upsample_conv = None

    self.receptive_field = receptive_field_size(layers, stacks, kernel_size)
def __init__(
        self,
        out_channels=2,
        layers=30,
        stacks=3,
        iaf_layer_size=None,
        residual_channels=64,
        gate_channels=64,
        kernel_size=3,
        dropout=1 - 0.95,
        cin_channels=-1,
        gin_channels=-1,
        n_speakers=None,
        weight_normalization=True,
        upsample_conditional_features=False,
        upsample_scales=None,
        freq_axis_kernel_size=3,
        scalar_input=True,
        is_student=True,
):
    """Build the student (IAF) WaveNet module graph.

    Args:
        out_channels (int): Output channel count of each flow's final
            1x1 conv; also the input channel count when ``scalar_input``
            is false.
        layers (int): With ``stacks``, defines the dilation cycle length
            (``layers // stacks``); ``layers`` must be a multiple of
            ``stacks`` (asserted below).
        stacks (int): Number of dilation stacks.
        iaf_layer_size (sequence of int): Residual-layer count of each
            IAF flow. Defaults to (10, 10, 10, 30). The previous
            signature used a mutable list default, which is a shared-
            state pitfall; ``None`` now stands in for the same value.
        residual_channels (int): Channels of the residual connections.
        gate_channels (int): Channels of the gated activation units.
        kernel_size (int): Kernel size of the dilated convolutions.
        dropout (float): Dropout probability passed to each residual layer.
        cin_channels (int): Local conditioning channels (-1 disables).
        gin_channels (int): Global conditioning channels (<= 0 disables
            the speaker embedding).
        n_speakers (int): Required when ``gin_channels > 0``.
        weight_normalization (bool): Forwarded to the conv constructors.
        upsample_conditional_features (bool): Build the 2D transposed-conv
            upsampling net for local conditioning features.
        upsample_scales (list): Time-axis strides of the upsampling layers;
            only read when ``upsample_conditional_features`` is true.
        freq_axis_kernel_size (int): Frequency-axis kernel size of the
            upsampling convolutions ("same" padding on that axis).
        scalar_input (bool): When true each flow consumes a single-channel
            (scalar) input instead of ``out_channels`` channels.
        is_student (bool): Stored as-is; not read in this constructor.
    """
    super(StudentWaveNet, self).__init__()
    # Avoid the mutable-default-argument pitfall: materialize the
    # default flow sizes per instance.
    if iaf_layer_size is None:
        iaf_layer_size = [10, 10, 10, 30]

    self.scalar_input = scalar_input
    self.out_channels = out_channels
    self.cin_channels = cin_channels
    self.is_student = is_student

    assert layers % stacks == 0
    layers_per_stack = layers // stacks

    # One input projection per IAF flow.
    in_channels = 1 if scalar_input else out_channels
    self.first_conv = nn.ModuleList([
        Conv1d1x1(in_channels, residual_channels)
        for _ in range(len(iaf_layer_size))
    ])

    # One dilated residual stack per IAF flow, plus one output head
    # (ReLU -> 1x1) per flow.
    self.iaf_layers = nn.ModuleList()
    self.last_layers = nn.ModuleList()
    for layer_size in iaf_layer_size:
        iaf_layer = nn.ModuleList()
        for layer in range(layer_size):
            # The dilation cycle length comes from layers // stacks, so
            # flows longer than one cycle wrap around.
            dilation = 2**(layer % layers_per_stack)
            iaf_layer.append(ResidualConv1dGLU(
                residual_channels,
                gate_channels,
                kernel_size=kernel_size,
                bias=True,  # magenta uses bias, but musyoku doesn't
                dilation=dilation,
                dropout=dropout,
                cin_channels=cin_channels,
                gin_channels=gin_channels,
                weight_normalization=weight_normalization))
        self.iaf_layers.append(iaf_layer)
        # Final layers of this IAF flow.
        self.last_layers.append(nn.ModuleList([
            nn.ReLU(inplace=True),
            Conv1d1x1(residual_channels, out_channels,
                      weight_normalization=weight_normalization),
        ]))

    # Global conditioning via a learned speaker embedding.
    if gin_channels > 0:
        assert n_speakers is not None
        self.embed_speakers = Embedding(n_speakers, gin_channels,
                                        padding_idx=None, std=0.1)
    else:
        self.embed_speakers = None

    # 2D transposed-conv net that upsamples local conditioning features
    # along the time axis only (stride 1 on the frequency axis).
    if upsample_conditional_features:
        self.upsample_conv = nn.ModuleList()
        freq_axis_padding = (freq_axis_kernel_size - 1) // 2
        for s in upsample_scales:
            convt = ConvTranspose2d(
                1, 1, (freq_axis_kernel_size, s),
                padding=(freq_axis_padding, 0),
                dilation=1, stride=(1, s),
                weight_normalization=weight_normalization)
            self.upsample_conv.append(convt)
            # ReLU keeps the upsampled conditioning non-negative
            # (features assumed scaled to [0, 1]).
            self.upsample_conv.append(nn.ReLU(inplace=True))
    else:
        self.upsample_conv = None

    # NOTE(review): computed from `layers`/`stacks`, not from the actual
    # per-flow sizes in `iaf_layer_size` — confirm this is intended.
    self.receptive_field = receptive_field_size(layers, stacks, kernel_size)