Example no. 1
0
    def __init__(self, labels=256, layers=20, stacks=2,
                 residual_channels=512,
                 gate_channels=512,
                 skip_out_channels=512,
                 kernel_size=3, dropout=1 - 0.95,
                 cin_channels=-1, gin_channels=-1, n_speakers=None,
                 weight_normalization=True,
                 upsample_conditional_features=False,
                 upsample_scales=None):
        """Build the WaveNet module graph.

        Args:
            labels: Number of input/output categories (channels of the
                first and last 1x1 convolutions).
            layers: Total number of dilated residual layers.
            stacks: Number of dilation cycles; ``layers`` must be divisible
                by ``stacks``.
            residual_channels: Channels of the residual connections.
            gate_channels: Channels of the gated activation units.
            skip_out_channels: Channels of the skip connections.
            kernel_size: Kernel size of the dilated convolutions.
            dropout: Dropout probability (default keeps 0.95 of units).
            cin_channels: Local conditioning channels; <= 0 disables.
            gin_channels: Global conditioning channels; <= 0 disables.
            n_speakers: Number of speakers; required when global
                conditioning is enabled.
            weight_normalization: Apply weight normalization to convs.
            upsample_conditional_features: Learn transposed convs to
                upsample local conditioning features in time.
            upsample_scales: Per-layer upsampling factors (iterated only
                when ``upsample_conditional_features`` is True).
        """
        super(WaveNet, self).__init__()
        self.labels = labels
        assert layers % stacks == 0
        layers_per_stack = layers // stacks
        # Entry 1x1 conv: categorical input -> residual channels.
        self.first_conv = Conv1d1x1(labels, residual_channels)
        # Dilated residual blocks; dilation doubles each layer and resets
        # at every stack boundary (1, 2, 4, ..., 1, 2, 4, ...).
        self.conv_layers = nn.ModuleList()
        for layer in range(layers):
            dilation = 2**(layer % layers_per_stack)
            conv = ResidualConv1dGLU(
                residual_channels, gate_channels,
                kernel_size=kernel_size,
                skip_out_channels=skip_out_channels,
                bias=True,  # magenda uses bias, but musyoku doesn't
                dilation=dilation, dropout=dropout,
                cin_channels=cin_channels,
                gin_channels=gin_channels,
                weight_normalization=weight_normalization)
            self.conv_layers.append(conv)
        # Output head: ReLU -> 1x1 -> ReLU -> 1x1 projecting skips to labels.
        self.last_conv_layers = nn.ModuleList([
            nn.ReLU(inplace=True),
            Conv1d1x1(skip_out_channels, skip_out_channels,
                      weight_normalization=weight_normalization),
            nn.ReLU(inplace=True),
            Conv1d1x1(skip_out_channels, labels,
                      weight_normalization=weight_normalization),
        ])

        if gin_channels > 0:
            assert n_speakers is not None
            self.embed_speakers = Embedding(
                n_speakers, gin_channels, padding_idx=None, std=0.1)
        else:
            # Fix: define the attribute unconditionally (matches the other
            # constructors) so `self.embed_speakers is None` checks work.
            self.embed_speakers = None

        # Upsample conv net
        if upsample_conditional_features:
            self.upsample_conv = nn.ModuleList()
            for s in upsample_scales:
                convt = ConvTranspose1d(
                    cin_channels, cin_channels, kernel_size=s, padding=0,
                    dilation=1, stride=s, std_mul=1.0)
                # Initialize to an averaging filter so upsampling starts as
                # (roughly) nearest-neighbor interpolation.
                convt.bias.data.zero_()
                convt.weight.data.fill_(1 / cin_channels)
                self.upsample_conv.append(convt)
                # Is this non-linearity necessary?
                self.upsample_conv.append(nn.ReLU(inplace=True))
        else:
            self.upsample_conv = None

        self.receptive_field = receptive_field_size(layers, stacks, kernel_size)
Example no. 2
0
    def __init__(
        self,
        out_channels=256,
        layers=20,
        stacks=2,
        residual_channels=512,
        gate_channels=512,
        skip_out_channels=512,
        kernel_size=3,
        dropout=1 - 0.95,
        cin_channels=-1,
        gin_channels=-1,
        n_speakers=None,
        weight_normalization=True,
        upsample_conditional_features=False,
        upsample_scales=None,
        freq_axis_kernel_size=3,
        scalar_input=False,
    ):
        """Assemble the WaveNet: entry conv, dilated residual stack,
        output head, and optional speaker embedding / feature upsampling.

        ``layers`` must be divisible by ``stacks``; the dilation pattern
        1, 2, 4, ... restarts at each stack boundary. With ``scalar_input``
        the network consumes a single-channel waveform, otherwise an
        ``out_channels``-wide (e.g. one-hot) input.
        """
        super(WaveNet, self).__init__()
        self.scalar_input = scalar_input
        self.out_channels = out_channels
        self.cin_channels = cin_channels
        assert layers % stacks == 0
        per_stack = layers // stacks

        # Entry 1x1 conv; channel count depends on the input representation.
        entry_channels = 1 if scalar_input else out_channels
        self.first_conv = Conv1d1x1(entry_channels, residual_channels)

        # Dilated residual blocks, registered in layer order.
        self.conv_layers = nn.ModuleList([
            ResidualConv1dGLU(
                residual_channels,
                gate_channels,
                kernel_size=kernel_size,
                skip_out_channels=skip_out_channels,
                bias=True,  # magenda uses bias, but musyoku doesn't
                dilation=2**(idx % per_stack),
                dropout=dropout,
                cin_channels=cin_channels,
                gin_channels=gin_channels,
                weight_normalization=weight_normalization)
            for idx in range(layers)
        ])

        # Output head: ReLU -> 1x1 -> ReLU -> 1x1 mapping skips to logits.
        self.last_conv_layers = nn.ModuleList([
            nn.ReLU(inplace=True),
            Conv1d1x1(skip_out_channels,
                      skip_out_channels,
                      weight_normalization=weight_normalization),
            nn.ReLU(inplace=True),
            Conv1d1x1(skip_out_channels,
                      out_channels,
                      weight_normalization=weight_normalization),
        ])

        # Speaker embedding only when global conditioning is enabled.
        if gin_channels > 0:
            assert n_speakers is not None
            self.embed_speakers = Embedding(n_speakers,
                                            gin_channels,
                                            padding_idx=None,
                                            std=0.1)
        else:
            self.embed_speakers = None

        # Optional 2D transposed-conv net that stretches conditioning
        # features along the time axis.
        if upsample_conditional_features:
            freq_pad = (freq_axis_kernel_size - 1) // 2
            self.upsample_conv = nn.ModuleList()
            for scale in upsample_scales:
                self.upsample_conv.append(ConvTranspose2d(
                    1,
                    1, (freq_axis_kernel_size, scale),
                    padding=(freq_pad, 0),
                    dilation=1,
                    stride=(1, scale),
                    weight_normalization=weight_normalization))
                # assuming we use [0, 1] scaled features
                # this should avoid non-negative upsampling output
                self.upsample_conv.append(nn.ReLU(inplace=True))
        else:
            self.upsample_conv = None

        self.receptive_field = receptive_field_size(layers, stacks,
                                                    kernel_size)
    def __init__(
        self,
        out_channels=2,
        layers=30,
        stacks=3,
        iaf_layer_size=(10, 10, 10, 30),
        residual_channels=64,
        gate_channels=64,
        kernel_size=3,
        dropout=1 - 0.95,
        cin_channels=-1,
        gin_channels=-1,
        n_speakers=None,
        weight_normalization=True,
        upsample_conditional_features=False,
        upsample_scales=None,
        freq_axis_kernel_size=3,
        scalar_input=True,
        is_student=True,
    ):
        """Build a student WaveNet made of stacked IAF flows.

        Each entry in ``iaf_layer_size`` defines one IAF flow: its own
        entry 1x1 conv, a stack of that many dilated residual layers, and
        a ReLU + 1x1 projection to ``out_channels``. ``layers`` must be
        divisible by ``stacks``; the dilation pattern restarts per stack.

        Fixes vs. the original: the mutable default ``iaf_layer_size``
        list is now a tuple (it is only iterated, never mutated), and a
        dead ``self.last_layers = []`` assignment — immediately shadowed
        by the ``nn.ModuleList`` below — was removed.
        """
        super(StudentWaveNet, self).__init__()
        self.scalar_input = scalar_input
        self.out_channels = out_channels
        self.cin_channels = cin_channels
        self.is_student = is_student
        assert layers % stacks == 0
        layers_per_stack = layers // stacks

        # One entry 1x1 conv per IAF flow; input is scalar noise (1 ch)
        # or an out_channels-wide representation.
        in_channels = 1 if scalar_input else out_channels
        self.first_conv = nn.ModuleList([
            Conv1d1x1(in_channels, residual_channels)
            for _ in range(len(iaf_layer_size))
        ])

        self.iaf_layers = nn.ModuleList()   # dilated stacks, one per flow
        self.last_layers = nn.ModuleList()  # per-flow output projections

        # Build the IAF flows (e.g. sizes 10, 10, 10, 30).
        for layer_size in iaf_layer_size:
            iaf_layer = nn.ModuleList()
            for layer in range(layer_size):
                dilation = 2**(layer % layers_per_stack)
                conv = ResidualConv1dGLU(
                    residual_channels,
                    gate_channels,
                    kernel_size=kernel_size,
                    bias=True,  # magenda uses bias, but musyoku doesn't
                    dilation=dilation,
                    dropout=dropout,
                    cin_channels=cin_channels,
                    gin_channels=gin_channels,
                    weight_normalization=weight_normalization)
                iaf_layer.append(conv)
            self.iaf_layers.append(iaf_layer)
            # Final projection of this flow: ReLU -> 1x1 to out_channels.
            self.last_layers.append(
                nn.ModuleList([
                    nn.ReLU(inplace=True),
                    Conv1d1x1(residual_channels,
                              out_channels,
                              weight_normalization=weight_normalization),
                ]))

        # Speaker embedding only when global conditioning is enabled.
        if gin_channels > 0:
            assert n_speakers is not None
            self.embed_speakers = Embedding(n_speakers,
                                            gin_channels,
                                            padding_idx=None,
                                            std=0.1)
        else:
            self.embed_speakers = None

        # Optional 2D transposed-conv net that upsamples conditioning
        # features along the time axis.
        if upsample_conditional_features:
            self.upsample_conv = nn.ModuleList()
            freq_axis_padding = (freq_axis_kernel_size - 1) // 2
            for s in upsample_scales:
                convt = ConvTranspose2d(
                    1,
                    1, (freq_axis_kernel_size, s),
                    padding=(freq_axis_padding, 0),
                    dilation=1,
                    stride=(1, s),
                    weight_normalization=weight_normalization)
                self.upsample_conv.append(convt)
                # assuming we use [0, 1] scaled features
                # this should avoid non-negative upsampling output
                self.upsample_conv.append(nn.ReLU(inplace=True))
        else:
            self.upsample_conv = None

        self.receptive_field = receptive_field_size(layers, stacks,
                                                    kernel_size)