Example #1
    def __init__(self, n_cond, n_res, n_dil, n_skp, stride, dil, filter_sz=2,
            bias=True, parent_rf=None, name=None):
        '''
        filter_sz: # elements in the dilated kernels
        n_cond: # channels of local condition vectors
        n_res : # residual channels
        n_dil : # output channels for dilated kernel
        n_skp : # channels output to skip connections
        '''
        super(GatedResidualCondConv, self).__init__()
        self.conv_signal = nn.Conv1d(n_res, n_dil, filter_sz, dilation=dil, bias=bias)
        self.conv_gate = nn.Conv1d(n_res, n_dil, filter_sz, dilation=dil, bias=bias)
        self.proj_signal = nn.Conv1d(n_cond, n_dil, kernel_size=1, bias=False)
        self.proj_gate = nn.Conv1d(n_cond, n_dil, kernel_size=1, bias=False)
        self.dil_res = nn.Conv1d(n_dil, n_res, kernel_size=1, bias=False)
        self.dil_skp = nn.Conv1d(n_dil, n_skp, kernel_size=1, bias=False)

        # The dilated autoregressive convolution produces an output at the
        # right-most position of the receptive field.  (At the very end of a
        # stack of these, the output corresponds to the position just after
        # the receptive field; within the stack, outputs stay right-aligned.)
        dil_filter_sz = (filter_sz - 1) * dil + 1
        self.rf = rfield.Rfield(filter_info=(dil_filter_sz - 1, 0),
                parent=parent_rf, name=name)
        self.beg_rf = None
        self.end_rf = None
        self.apply(netmisc.xavier_init)
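
A minimal sketch (plain PyTorch, independent of this repo's rfield and netmisc helpers) of two facts the module above relies on: an unpadded dilated kernel of size filter_sz shrinks the time axis by (filter_sz - 1) * dil, which is the dil_filter_sz - 1 recorded in the Rfield, and the signal/gate pair combine through the gated activation tanh(s) * sigmoid(g):

    import torch
    import torch.nn as nn

    filter_sz, dil, n_res, n_dil = 2, 4, 8, 16
    conv_signal = nn.Conv1d(n_res, n_dil, filter_sz, dilation=dil)
    conv_gate = nn.Conv1d(n_res, n_dil, filter_sz, dilation=dil)
    x = torch.randn(1, n_res, 100)
    # Gated activation over the two dilated convolutions
    z = torch.tanh(conv_signal(x)) * torch.sigmoid(conv_gate(x))
    # Unpadded dilated conv drops (filter_sz - 1) * dil timesteps
    assert z.shape[-1] == 100 - (filter_sz - 1) * dil   # 96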
Example #2
    def __init__(self,
                 n_in_chan,
                 n_out_chan,
                 filter_sz,
                 stride=1,
                 do_res=True,
                 parent_rf=None,
                 name=None):
        super(ConvReLURes, self).__init__()
        self.do_res = do_res
        if self.do_res and stride != 1:
            raise ValueError(
                    'Stride must be 1 for a residually connected convolution')

        self.n_in = n_in_chan
        self.n_out = n_out_chan
        self.conv = nn.Conv1d(n_in_chan,
                              n_out_chan,
                              filter_sz,
                              stride,
                              padding=0,
                              bias=False)
        self.relu = nn.ReLU()
        # self.bn = nn.BatchNorm1d(n_out_chan)

        self.rf = rfield.Rfield(filter_info=filter_sz,
                                stride=stride,
                                parent=parent_rf,
                                name=name)
        netmisc.xavier_init(self.conv)
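
A hedged sketch of why do_res forces stride == 1: with stride 1 and no padding, the convolution only drops (filter_sz - 1) timesteps, so the input can still be trimmed into sample-for-sample alignment for the residual add; with stride > 1 the two feature maps no longer line up. The wing sizes below are illustrative, not the repo's Rfield bookkeeping:

    import torch
    import torch.nn as nn

    n_chan, filter_sz = 8, 5
    conv = nn.Conv1d(n_chan, n_chan, filter_sz, stride=1, padding=0, bias=False)
    x = torch.randn(1, n_chan, 50)
    y = torch.relu(conv(x))
    left = (filter_sz - 1) // 2                   # illustrative left wing
    right = filter_sz - 1 - left                  # illustrative right wing
    out = y + x[..., left:x.shape[-1] - right]    # trimmed residual add
    assert out.shape[-1] == 50 - (filter_sz - 1)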
Example #3
    def __init__(self, sample_rate=16000, win_sz=400, hop_sz=160, n_mels=80,
            n_mfcc=13, name=None):
        self.sample_rate = sample_rate
        self.window_sz = win_sz
        self.hop_sz = hop_sz
        self.n_mels = n_mels
        self.n_mfcc = n_mfcc
        self.n_out = n_mfcc * 3
        self.rf = rfield.Rfield(filter_info=self.window_sz, stride=self.hop_sz,
                parent=None, name=name)
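
The framing arithmetic this front end implies, as a small sketch: a window of win_sz samples advancing by hop_sz yields (n_samples - win_sz) // hop_sz + 1 full frames, and n_out = n_mfcc * 3 suggests (an assumption, not stated in the code) MFCCs stacked with their deltas and delta-deltas:

    sample_rate, win_sz, hop_sz, n_mfcc = 16000, 400, 160, 13
    n_samples = sample_rate                        # one second of audio
    n_frames = (n_samples - win_sz) // hop_sz + 1
    print(n_frames, n_mfcc * 3)                    # 98 frames, 39 output channels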
Example #4
    def __init__(self, n_chan, filter_sz, stride, parent_rf, name=None):
        super(Upsampling, self).__init__()
        # See upsampling_notes.txt: padding = filter_sz - stride
        # and: left_offset = left_wing_sz - end_padding
        end_padding = stride - 1
        self.rf = rfield.Rfield(filter_info=filter_sz, stride=stride,
                padding=(end_padding, end_padding), is_downsample=False,
                parent=parent_rf, name=name)

        self.tconv = nn.ConvTranspose1d(n_chan, n_chan, filter_sz, stride,
                padding=filter_sz - stride)
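
A sketch (plain PyTorch, default output_padding assumed) verifying the length arithmetic behind padding = filter_sz - stride: the transposed convolution maps L input steps to stride * L - (filter_sz - stride) outputs, i.e. stride-fold upsampling short by filter_sz - stride positions at the edges, which is what the Rfield's end_padding entries account for:

    import torch
    import torch.nn as nn

    n_chan, filter_sz, stride, L = 8, 8, 4, 10
    tconv = nn.ConvTranspose1d(n_chan, n_chan, filter_sz, stride,
            padding=filter_sz - stride)
    y = tconv(torch.randn(1, n_chan, L))
    # (L-1)*stride - 2*(filter_sz-stride) + filter_sz == stride*L - (filter_sz-stride)
    assert y.shape[-1] == stride * L - (filter_sz - stride)   # 36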
Example #5
    def __init__(self, filter_sz, n_lc_in, n_lc_out, lc_upsample_filt_sizes,
            lc_upsample_strides, n_res, n_dil, n_skp, n_post, n_quant,
            n_blocks, n_block_layers, jitter_prob, n_speakers, n_global_embed,
            bias=True, parent_rf=None):
        super(WaveNet, self).__init__()

        self.n_blocks = n_blocks
        self.n_block_layers = n_block_layers
        self.n_quant = n_quant
        self.quant_onehot = None 
        self.bias = bias
        self.jitter = Jitter(jitter_prob)
        post_jitter_filt_sz = 3
        lc_input_stepsize = np_prod(lc_upsample_strides) 

        lc_conv_name = 'LC_Conv(filter_size={})'.format(post_jitter_filt_sz) 
        self.lc_conv = Conv1dWrap(n_lc_in, n_lc_out,
                kernel_size=post_jitter_filt_sz, stride=1, bias=self.bias)

        cur_rf = rfield.Rfield(filter_info=post_jitter_filt_sz,
                stride=1, parent=parent_rf, name=lc_conv_name)
        self.beg_rf = cur_rf
        
        # This RF is the first processing of the local conditioning after the
        # Jitter. It is the starting point for the commitment loss aggregation
        self.pre_upsample_rf = cur_rf
        self.lc_upsample = nn.Sequential()

        # When WaveNet runs stand-alone, parent_rf is None; the Autoencoder
        # model in model.py links the parent_rfs of its submodules together.
        for i, (filt_sz, stride) in enumerate(zip(lc_upsample_filt_sizes,
            lc_upsample_strides)):
            name = 'Upsampling_{}(filter_sz={}, stride={})'.format(i, filt_sz, stride)   
            mod = Upsampling(n_lc_out, filt_sz, stride, cur_rf, name=name)
            self.lc_upsample.add_module(str(i), mod)
            cur_rf = mod.rf

        # This rf describes the bounds of the input wav corresponding to the
        # local conditioning vectors
        self.last_upsample_rf = cur_rf
        self.cond = Conditioning(n_speakers, n_global_embed)
        self.base_layer = Conv1dWrap(n_quant, n_res, kernel_size=1, stride=1,
                dilation=1, bias=self.bias)

        self.conv_layers = nn.ModuleList() 
        n_cond = n_lc_out + n_global_embed

        for b in range(self.n_blocks):
            for bl in range(self.n_block_layers):
                dil = 2**bl
                name = 'GRCC_{},{}(dil={})'.format(b, bl, dil)
                grc = GatedResidualCondConv(n_cond, n_res, n_dil, n_skp, 1,
                        dil, filter_sz, bias, cur_rf, name)
                self.conv_layers.append(grc)
                cur_rf = grc.rf

        self.last_grcc_rf = cur_rf

        # Each module in the stack needs to know the dimensions of
        # the input and output of the overall stack, in order to trim
        # residual connections
        beg_grcc_rf = self.conv_layers[0].rf
        end_grcc_rf = self.conv_layers[-1].rf 
        for mod in self.conv_layers:
            mod.init_bound_rfs(beg_grcc_rf, end_grcc_rf)

        self.relu = nn.ReLU()
        self.post1 = Conv1dWrap(n_skp, n_post, 1, bias=bias)
        self.post2 = Conv1dWrap(n_post, n_quant, 1, bias=bias)
        self.logsoftmax = nn.LogSoftmax(1) # (B, Q, N)
        self.rf = cur_rf
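
A closing sketch of the receptive-field arithmetic implied by the dilation loop above (the dilated stack only; the conditioning path is tracked separately through Rfield): each block's kernels of size filter_sz at dilations 1, 2, ..., 2**(n_block_layers - 1) add (filter_sz - 1) * (2**n_block_layers - 1) input steps:

    filter_sz, n_blocks, n_block_layers = 2, 2, 10
    rf = n_blocks * (filter_sz - 1) * (2**n_block_layers - 1) + 1
    print(rf)   # 2047 input timesteps per output timestep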