Example no. 1
    def _init_geometry(self, batch_win_size):
        """
        Initializes:
        self.enc_in_len
        self.trim_ups_out
        self.trim_dec_out
        self.trim_dec_in
        """
        # Calculate max length of mfcc encoder input and wav decoder input
        w = batch_win_size
        mfcc_vc = self.encoder.vc['beg'].parent
        beg_grcc_vc = self.decoder.vc['beg_grcc']
        end_grcc_vc = self.decoder.vc['end_grcc']
        end_ups_vc = self.decoder.vc['last_upsample']
        end_enc_vc = self.encoder.vc['end']

        do = vconv.GridRange((0, 100000), (0, w), 1)
        di = vconv.input_range(beg_grcc_vc, end_grcc_vc, do)
        ei = vconv.input_range(mfcc_vc, end_grcc_vc, do)
        mi = vconv.input_range(mfcc_vc.child, end_grcc_vc, do)
        eo = vconv.output_range(mfcc_vc, end_enc_vc, ei)
        uo = vconv.output_range(mfcc_vc, end_ups_vc, ei)

        # Needed for trimming various tensors
        self.enc_in_len = ei.sub_length()
        self.enc_in_mel_len = mi.sub_length()
        self.embed_len = eo.sub_length() 
        self.dec_in_len = di.sub_length()
        self.trim_dec_in = torch.tensor(
            [di.sub[0] - ei.sub[0], di.sub[1] - ei.sub[0]], dtype=torch.long)
        self.decoder.trim_ups_out = torch.tensor(
            [di.sub[0] - uo.sub[0], di.sub[1] - uo.sub[0]], dtype=torch.long)
        self.trim_dec_out = torch.tensor(
            [do.sub[0] - di.sub[0], do.sub[1] - di.sub[0]], dtype=torch.long)
Example no. 2
    def _init_geometry(self, batch_win_size):
        """
        Initializes lengths and trimming needed to produce batch_win_size
        output
        
        self.enc_in_len - encoder input length (timesteps)
        self.dec_in_len - decoder input length (timesteps)
        self.trim_ups_out - trims decoder lc_dense before use  
        self.trim_dec_out - trims wav_dec_input to wav_dec_output
        self.trim_dec_in  - trims wav_enc_input to wav_dec_input

        The trimming vectors are needed because, due to striding geometry,
        output tensors cannot always be produced in single-increment sizes and
        must therefore be over-produced in some cases.
        """
        # Calculate max length of mfcc encoder input and wav decoder input
        w = batch_win_size
        mfcc_vc = self.encoder.vc['beg'].parent
        end_enc_vc = self.encoder.vc['end']
        end_ups_vc = self.decoder.vc['last_upsample']
        beg_grcc_vc = self.decoder.vc['beg_grcc']
        end_grcc_vc = self.decoder.vc['end_grcc']

        # naming: (d: decoder, e: encoder, u: upsample), (o: output, i: input)
        do = vconv.GridRange((0, 100000), (0, w), 1)
        di = vconv.input_range(beg_grcc_vc, end_grcc_vc, do)
        ei = vconv.input_range(mfcc_vc, end_grcc_vc, do)
        mi = vconv.input_range(mfcc_vc.child, end_grcc_vc, do)
        eo = vconv.output_range(mfcc_vc, end_enc_vc, ei)
        uo = vconv.output_range(mfcc_vc, end_ups_vc, ei)

        # Needed for trimming various tensors
        self.enc_in_len = ei.sub_length()
        self.enc_in_mel_len = mi.sub_length()
        # used by jitter_index
        self.embed_len = eo.sub_length()

        # sets size for wav_dec_in
        self.dec_in_len = di.sub_length()

        # trims wav_enc_input to wav_dec_input
        self.trim_dec_in = torch.tensor(
            [di.sub[0] - ei.sub[0], di.sub[1] - ei.sub[0]], dtype=torch.long)

        # needed by wavenet to trim upsampled local conditioning tensor
        self.decoder.trim_ups_out = torch.tensor(
            [di.sub[0] - uo.sub[0], di.sub[1] - uo.sub[0]], dtype=torch.long)

        # trims wav_dec_input to wav_dec_output
        self.trim_dec_out = torch.tensor(
            [do.sub[0] - di.sub[0], do.sub[1] - di.sub[0]], dtype=torch.long)
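
The trim tensors computed above are consumed as (begin, end) offsets when
slicing along the time dimension. A minimal sketch of that use, with
hypothetical names (model, decoder, wav_enc_input, wav_dec_input, lc_dense
are illustrative, borrowed from the docstring) and an assumed
(batch, channel, time) layout:

# sketch only: trim the over-produced upsampled conditioning tensor
# down to the decoder input window
b, e = decoder.trim_ups_out
lc_dense = lc_dense[:, :, b:e]

# sketch only: slice the decoder wav input out of the encoder input window
b, e = model.trim_dec_in
wav_dec_input = wav_enc_input[:, b:e]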
Example no. 3
def autoenc_test(vcs, in_len, slice_beg):
    enc = vcs['MFCC'], vcs['Upsampling_3']
    dec = vcs['GRCC_0,0'], vcs['GRCC_1,9']
    mfcc = vcs['MFCC'], vcs['MFCC']
    autoenc = vcs['MFCC'], vcs['GRCC_1,9']

    full_in = vconv.GridRange((0, in_len), (0, in_len), 1)
    full_mfcc = vconv.output_range(*mfcc, full_in)
    full_out = vconv.output_range(*autoenc, full_in)

    out_req = vconv.GridRange(full_out.full, (slice_beg, slice_beg + 100), 1)
    mid_req = vconv.input_range(*dec, out_req)
    in_req = vconv.input_range(*enc, mid_req)
    in_act = in_req
    mfcc_act = vconv.output_range(*mfcc, in_act)
    mid_act = vconv.output_range(*enc, in_act)

    # wav -> wav_mid 
    wav_mid_sl = vconv.tensor_slice(in_act, mid_req.sub)
    # wav_mid_ten = wav_ten[wav_mid_sl]

    # lcond -> lcond_sl
    lcond_sl = vconv.tensor_slice(mid_act, mid_req.sub)
    # lcond_sl_ten = lcond_ten[lcond_sl]
    
    # wav -> wav_out 
    # +1 since it is predicting the next step
    wav_out_sl = vconv.tensor_slice(in_act, out_req.sub)
    # wav_out_ten = wav_ten[sl_b+1:sl_e+1]

    mfcc_in_sl = vconv.tensor_slice(full_mfcc, mfcc_act.sub)

    print('{:10}: {}'.format('full_in', full_in))
    print('{:10}: {}'.format('full_mfcc', full_mfcc))
    print('{:10}: {}'.format('in_req', in_req))
    print('{:10}: {}'.format('mfcc_act', mfcc_act))
    print('{:10}: {}'.format('mid_req', mid_req))
    print('{:10}: {}'.format('mid_act', mid_act))
    print('{:10}: {}'.format('out_req', out_req))
    print('{:10}: {}'.format('full_out', full_out))

    print('wav_mid_sl: {}  len: {}'.format(wav_mid_sl, wav_mid_sl[1] -
        wav_mid_sl[0]))
    print('mfcc_in_sl: {}  len: {}'.format(mfcc_in_sl, mfcc_in_sl[1] -
        mfcc_in_sl[0]))
    print('lcond_sl: {}  len: {}'.format(lcond_sl, lcond_sl[1] - lcond_sl[0]))
    print('wav_out_sl: {}  len: {}'.format(wav_out_sl, wav_out_sl[1] - wav_out_sl[0]))
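
The commented-out assignments above hint at how these slices are applied. A
minimal sketch, treating each vconv.tensor_slice result as a (begin, end)
pair and assuming hypothetical wav_ten and lcond_ten tensors that cover
full_in and the encoder output respectively:

# sketch only: apply the computed (begin, end) slices
wav_mid_ten = wav_ten[wav_mid_sl[0]:wav_mid_sl[1]]      # decoder wav input
lcond_sl_ten = lcond_ten[:, lcond_sl[0]:lcond_sl[1]]    # local conditioning
# shifted by +1 because the decoder predicts the next timestep
wav_out_ten = wav_ten[wav_out_sl[0] + 1:wav_out_sl[1] + 1]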
Example no. 4
    def _init_geometry(self, batch_win_size):
        """
        Initializes:
        self.enc_in_len - timesteps of encoder input needed to
                          produce batch_win_size decoder output timesteps
        self.trim_ups_out - offsets for trimming the upsampler output tensor
        self.trim_dec_out - offsets for trimming the decoder output
        self.trim_dec_in  - offsets for trimming the decoder input

        The trimming vectors are needed because, due to striding geometry,
        output tensors cannot always be produced in single-increment sizes and
        must therefore be over-produced in some cases.
        """
        # Calculate max length of mfcc encoder input and wav decoder input
        w = batch_win_size
        mfcc_vc = self.encoder.vc['beg'].parent
        beg_grcc_vc = self.decoder.vc['beg_grcc']
        end_grcc_vc = self.decoder.vc['end_grcc']
        end_ups_vc = self.decoder.vc['last_upsample']
        end_enc_vc = self.encoder.vc['end']

        # naming: (d: decoder, e: encoder, u: upsample), (o: output, i: input)
        do = vconv.GridRange((0, 100000), (0, w), 1)
        di = vconv.input_range(beg_grcc_vc, end_grcc_vc, do)
        ei = vconv.input_range(mfcc_vc, end_grcc_vc, do)
        mi = vconv.input_range(mfcc_vc.child, end_grcc_vc, do)
        eo = vconv.output_range(mfcc_vc, end_enc_vc, ei)
        uo = vconv.output_range(mfcc_vc, end_ups_vc, ei)

        # Needed for trimming various tensors
        self.enc_in_len = ei.sub_length()
        self.enc_in_mel_len = mi.sub_length()
        self.embed_len = eo.sub_length()
        self.dec_in_len = di.sub_length()
        self.trim_dec_in = torch.tensor(
            [di.sub[0] - ei.sub[0], di.sub[1] - ei.sub[0]], dtype=torch.long)
        self.decoder.trim_ups_out = torch.tensor(
            [di.sub[0] - uo.sub[0], di.sub[1] - uo.sub[0]], dtype=torch.long)
        self.trim_dec_out = torch.tensor(
            [do.sub[0] - di.sub[0], do.sub[1] - di.sub[0]], dtype=torch.long)
Example no. 5
def downsample_test(vc, x):
    """
    For a downsampling stack (stride ratio > 1), the output->input
    round-trip may widen the input range, so it is applied twice: the
    second round-trip must reproduce the first's input range exactly.
    """
    try:
        y = vconv.output_range(vc, vc, x)
    except RuntimeError:
        return Result.NO_OUTPUT
    try:
        xn = vconv.input_range(vc, vc, y)
    except RuntimeError:
        return Result.NO_INPUT

    try:
        yt = vconv.output_range(vc, vc, xn)
    except RuntimeError:
        return Result.NO_OUTPUT
    try:
        xt = vconv.input_range(vc, vc, yt)
    except RuntimeError:
        return Result.NO_INPUT

    if xn != xt:
        return Result.UNEQUAL
    else:
        return Result.SUCCESS
Example no. 6
def same_or_upsample_test(vc, x):
    """
    For a same-rate or upsampling stack (stride ratio <= 1), a single
    output->input round-trip must reproduce the input range x exactly.
    """
    try:
        y = vconv.output_range(vc, vc, x)
    except RuntimeError:
        return Result.NO_OUTPUT
    try:
        xn = vconv.input_range(vc, vc, y)
    except RuntimeError:
        return Result.NO_INPUT

    if xn != x:
        return Result.UNEQUAL
    else:
        return Result.SUCCESS
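
Both tests return a Result value that the test driver in Example no. 8
tallies. The enum itself is not shown in these excerpts; a minimal sketch
consistent with the members used here:

from enum import Enum, auto

class Result(Enum):
    SUCCESS = auto()    # round-trip reproduced the input range
    UNEQUAL = auto()    # round-trip yielded a different input range
    NO_OUTPUT = auto()  # vconv.output_range raised RuntimeError
    NO_INPUT = auto()   # vconv.input_range raised RuntimeError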
Example no. 7
    def sample(self, wav_onehot, lc_sparse, speaker_inds, jitter_index, n_rep):
        """
        Generate n_rep samples, using lc_sparse and speaker_inds for local
        and global conditioning.

        wav_onehot: full-length one-hot wav tensor
        lc_sparse: full-length local conditioning tensor derived from the
        full wav_onehot
        """
        # initialize model geometry
        mfcc_vc = self.vc['beg'].parent
        up_vc = self.vc['pre_upsample'].child
        beg_grcc_vc = self.vc['beg_grcc']
        end_vc = self.vc['end_grcc']

        # calculate full output range
        wav_gr = vconv.GridRange((0, 1e12), (0, wav_onehot.size()[2]), 1)
        full_out_gr = vconv.output_range(mfcc_vc, end_vc, wav_gr)
        n_ts = full_out_gr.sub_length()

        # calculate starting input range for single timestep
        one_gr = vconv.GridRange((0, 1e12), (0, 1), 1)
        vconv.compute_inputs(end_vc, one_gr)

        # calculate starting position of wav
        wav_beg = int(beg_grcc_vc.input_gr.sub[0] - mfcc_vc.input_gr.sub[0])
        # wav_end = int(beg_grcc_vc.input_gr.sub[1] - mfcc_vc.input_gr.sub[0])
        wav_onehot = wav_onehot[:,:,wav_beg:]

        # !!! hack - I'm not sure why the int() cast is necessary
        n_init_ts = int(beg_grcc_vc.in_len())

        lc_sparse = lc_sparse.repeat(n_rep, 1, 1)
        jitter_index = jitter_index.repeat(n_rep, 1)
        speaker_inds = speaker_inds.repeat(n_rep)

        # precalculate conditioning vector for all timesteps
        D1 = lc_sparse.size()[1]
        lc_jitter = torch.take(lc_sparse,
                jitter_index.unsqueeze(1).expand(-1, D1, -1))
        lc_conv = self.lc_conv(lc_jitter) 
        lc_dense = self.lc_upsample(lc_conv)
        cond = self.cond(lc_dense, speaker_inds)
        n_ts = cond.size()[2]

        
        # cond_loff, cond_roff = vconv.output_offsets(mfcc_vc, up_end_vc)

        # window of samples to re-synthesize (the zeroing experiments below
        # are currently commented out)
        start_pos = 26000
        n_samples = 20000
        end_pos = start_pos + n_samples

        # wav_onehot[...,n_init_ts:] = 0
        wav_onehot = wav_onehot.repeat(n_rep, 1, 1)
        # wav_onehot[...,start_pos:end_pos] = 0

        # assert cond.size()[2] == wav_onehot.size()[2]

        # loop through timesteps
        # inrange = torch.tensor((0, n_init_ts), dtype=torch.int32)
        inrange = torch.tensor((start_pos - n_init_ts, start_pos), dtype=torch.int32)
        # end_ind = torch.tensor([n_ts], dtype=torch.int32)
        end_ind = torch.tensor([end_pos], dtype=torch.int32)

        # inefficient - this recalculates intermediate activations for the
        # entire receptive fields, rather than just the advancing front
        while not torch.equal(inrange[1], end_ind[0]):
        # while inrange[1] != end_ind[0]:
            sig = self.base_layer(wav_onehot[:,:,inrange[0]:inrange[1]]) 
            sig, skp_sum = self.conv_layers[0](sig, cond[:,:,inrange[0]:inrange[1]])
            for layer in self.conv_layers[1:]:
                sig, skp = layer(sig, cond[:,:,inrange[0]:inrange[1]])
                skp_sum += skp

            post1 = self.post1(self.relu(skp_sum))
            quant = self.post2(self.relu(post1))
            cat = dcat.OneHotCategorical(logits=quant.squeeze(2))
            wav_onehot[1:,:,inrange[1]] = cat.sample()[1:,...]
            inrange += 1
            if inrange[0] % 100 == 0:
                print(inrange, end_ind[0])

        
        # convert to value format
        quant_range = wav_onehot.new(list(range(self.n_quant)))
        wav = torch.matmul(wav_onehot.permute(0,2,1), quant_range)
        torch.set_printoptions(threshold=100000)
        pad = 5
        print('padding = {}'.format(pad))
        print('original')
        print(wav[0,start_pos-pad:end_pos+pad])
        print('synth')
        print(wav[1,start_pos-pad:end_pos+pad])

        # print(wav[:,end_pos:end_pos + 10000])
        print('synth range std: {}, baseline std: {}'.format(
            wav[:,start_pos:end_pos].std(), wav[:,end_pos:].std()
            ))

        return wav
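
The final matmul above decodes one-hot rows back into quantization levels.
A tiny standalone check of that decoding step, using toy shapes rather than
the model's real sizes:

import torch

# (B, T, Q) one-hot rows; matmul with [0..Q-1] recovers each level index
onehot = torch.tensor([[[0., 1., 0., 0.],
                        [0., 0., 0., 1.]]])
quant_range = torch.arange(4, dtype=torch.float)
print(torch.matmul(onehot, quant_range))  # tensor([[1., 3.]])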
Example no. 8
def run_tests(t):
    # (hypothetical wrapper: the original excerpt begins mid-function;
    # 'results' tallies Result values and 'c' counts test cases)
    results = {r: 0 for r in Result}
    c = 0
    for vc, x in input_gen(t):
        # downsampling stacks need the stronger two-round-trip test
        if vc.stride_ratio.numerator > 1:
            res = downsample_test(vc, x)
        else:
            res = same_or_upsample_test(vc, x)
        results[res] += 1
        if c > 0 and c % t.report_freq == 0:
            print(results)
        c += 1

    print('Finished')
    print('Results: {}'.format(results))


x = vconv.GridRange((0, 250000), (0, 250000), 1)
y = vconv.output_range(vcs['MFCC'], vcs['GRCC_1,9'], x)
xi = vconv.input_range(vcs['MFCC'], vcs['GRCC_1,9'], y)

#print('x0: {}'.format(x))
#print('y0: {}'.format(y))
#print('xi: {}'.format(xi))


def autoenc_test(vcs, in_len, slice_beg):
    enc = vcs['MFCC'], vcs['Upsampling_3']
    dec = vcs['GRCC_0,0'], vcs['GRCC_1,9']
    mfcc = vcs['MFCC'], vcs['MFCC']
    autoenc = vcs['MFCC'], vcs['GRCC_1,9']

    full_in = vconv.GridRange((0, in_len), (0, in_len), 1)
    full_mfcc = vconv.output_range(*mfcc, full_in)