Example #1
import sys
import numpy as np
from ulaw import lin2ulaw  # mu-law helpers defined elsewhere in the project

# (model construction and optimizer settings are omitted in this excerpt)
model.compile(loss='sparse_categorical_crossentropy',
              metrics=['sparse_categorical_accuracy'])
model.summary()

feature_file = sys.argv[1]
pcm_file = sys.argv[2]  # 16-bit signed PCM samples (loaded below as int16)
frame_size = 160
nb_features = 55
nb_used_features = model.nb_used_features
feature_chunk_size = 15
pcm_chunk_size = frame_size * feature_chunk_size

# u for unquantised, load 16 bit PCM samples and convert to mu-law

udata = np.fromfile(pcm_file, dtype='int16')
data = lin2ulaw(udata)
nb_frames = len(data) // pcm_chunk_size
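# For reference, mu-law companding maps 16-bit linear samples onto 256 roughly
# logarithmic levels centred on code 128 (so 128 is "digital zero"). A minimal
# sketch of the conversion pair, assuming mu = 255; the project's actual
# lin2ulaw/ulaw2lin helpers may scale slightly differently:
def lin2ulaw_sketch(x, mu=255.0):
    s = np.sign(x)
    y = s * np.log(1 + mu * np.abs(x) / 32768.0) / np.log(1 + mu)
    return np.clip(np.round(128 + 128 * y), 0, 255).astype('int16')

def ulaw2lin_sketch(u, mu=255.0):
    y = (np.asarray(u, dtype='float32') - 128) / 128.0
    return np.sign(y) * (32768.0 / mu) * (np.power(1 + mu, np.abs(y)) - 1)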

features = np.fromfile(feature_file, dtype='float32')

# limit to discrete number of frames
data = data[:nb_frames * pcm_chunk_size]
udata = udata[:nb_frames * pcm_chunk_size]
features = features[:nb_frames * feature_chunk_size * nb_features]

# Noise injection: the idea is that the real system is going to be
# predicting samples based on previously predicted samples rather than
# on the original ones. Since the previously predicted samples aren't
# expected to be as good, I add noise to the training data. Exactly
# how the noise is added makes a huge difference (one possible scheme
# is sketched below).
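# A minimal, illustrative sketch of such a scheme (not the exact schedule used
# to train the published model): shift the mu-law samples by one to form the
# network input, add small integer noise to the later part of the data, and
# clip back to the valid 0..255 range. The names in_data and noise are ours.
in_data = np.concatenate([data[0:1], data[:-1]])   # previous sample as input
noise = np.concatenate([np.zeros(len(data) // 2, dtype='int16'),
                        np.random.randint(-3, 4, len(data) - len(data) // 2)])
in_data = np.clip(in_data + noise, 0, 255).astype('int16')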
Example #2
# (the setup of the encoder/decoder models enc and dec, the pcm/fexc/iexc
#  buffers, the feature/period arrays and constants such as order and
#  out_file is omitted from this excerpt)
state1 = np.zeros((1, model.rnn_units1), dtype='float32')
state2 = np.zeros((1, model.rnn_units2), dtype='float32')

mem = 0
coef = 0.85

fout = open(out_file, 'wb')

skip = order + 1
for c in range(0, nb_frames):
    cfeat = enc.predict([features[c:c+1, :, :nb_used_features], periods[c:c+1, :, :]])
    for fr in range(0, feature_chunk_size):
        f = c*feature_chunk_size + fr
        a = features[c, fr, nb_features-order:]
        for i in range(skip, frame_size):
            pred = -sum(a*pcm[f*frame_size + i - 1:f*frame_size + i - order-1:-1])
            fexc[0, 0, 1] = lin2ulaw(pred)

            p, state1, state2 = dec.predict([fexc, iexc, cfeat[:, fr:fr+1, :], state1, state2])
            #Lower the temperature for voiced frames to reduce noisiness
            p *= np.power(p, np.maximum(0, 1.5*features[c, fr, 37] - .5))
            p = p/(1e-18 + np.sum(p))
            #Cut off the tail of the remaining distribution
            p = np.maximum(p-0.002, 0).astype('float64')
            p = p/(1e-8 + np.sum(p))

            iexc[0, 0, 0] = np.argmax(np.random.multinomial(1, p[0,0,:], 1))
            pcm[f*frame_size + i] = pred + ulaw2lin(iexc[0, 0, 0])
            fexc[0, 0, 0] = lin2ulaw(pcm[f*frame_size + i])
            mem = coef*mem + pcm[f*frame_size + i]
            #print(mem)
            np.array([np.round(mem)], dtype='int16').tofile(fout)
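The per-sample shaping of the decoder output in the loop above (sharpening the
distribution for voiced frames, truncating its tail, renormalising, then drawing
one mu-law index) can be read as a small helper. A minimal sketch of that step;
the name sample_mulaw and its tail argument are ours, and voicing stands for the
voicing/pitch-correlation feature the loop reads (features[c, fr, 37]):

def sample_mulaw(p, voicing, tail=0.002):
    # p is the decoder's probability vector over the 256 mu-law levels
    p = p * np.power(p, np.maximum(0, 1.5 * voicing - .5))   # lower temperature when voiced
    p = p / (1e-18 + np.sum(p))
    p = np.maximum(p - tail, 0).astype('float64')            # cut off the tail
    p = p / (1e-8 + np.sum(p))
    return int(np.argmax(np.random.multinomial(1, p, 1)))    # one sampled index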
def synthesis(args, hparams):
    model = LPCNet(hparams).cuda()
    feature_file = args.feature_file
    out_file = args.out_file

    frame_size = hparams.frame_size
    nb_features = hparams.nb_features

    features = np.fromfile(feature_file, dtype='float32')
    features = features.reshape(-1, nb_features)
    #features = np.resize(features, (-1, nb_features))  # using np.resize here would drop the last row of data

    nb_frames = 1

    feature_chunk_size = features.shape[0]
    pcm_chunk_size = frame_size * feature_chunk_size
    features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
    periods = (.1 + 50*features[:,:,hparams.pitch_idx:hparams.pitch_idx+1]+100).astype('int16')

    if hparams.checkpoint_file is None or not os.path.isfile(hparams.checkpoint_file):
        return
    checkpoint_dict = torch.load(hparams.checkpoint_file)
    model.load_state_dict(checkpoint_dict['state_dict'])

    #model_init(model)

    model.eval()

    order = 16

    pcm = np.zeros((nb_frames * pcm_chunk_size,))
    fexc = np.zeros((1, 1, 2), dtype='float32')
    iexc = np.zeros((1, 1, 1), dtype='int16')
    state1 = torch.Tensor(np.zeros((1, 1, hparams.rnn_units1), dtype='float32')).cuda()
    state2 = torch.Tensor(np.zeros((1, 1, hparams.rnn_units2), dtype='float32')).cuda()

    mem = 0
    coef = 0.85

    fout = open(out_file, "wb")
    skip = order + 1

    for c in range(0, nb_frames):
        cfeat = model.encoder(features[c:c+1, :, :nb_features], periods[c:c+1, :, :])
        fexc[0, 0, 0] = 128  # 128 is the mu-law code for zero
        iexc[0, 0, 0] = 128
        for fr in range(0, feature_chunk_size):
            f = c * feature_chunk_size + fr
            a = features[c, fr, nb_features - order:]
            for i in range(skip, frame_size):
                pred = -sum(a*pcm[f*frame_size + i - 1:f*frame_size + i - order-1:-1])
                fexc[0, 0, 1] = lin2ulaw(pred)

                p_tensor, state1, state2 = model.decoder(fexc, iexc, cfeat[:, fr:fr+1, :], state1, state2)
                p = p_tensor.detach().cpu().numpy()
                # Lower the temperature for voiced frames to reduce noisiness
                p *= np.power(p, np.maximum(0, 1.5 * features[c, fr, hparams.pitch_idx+1] - .5))
                p = p / (1e-18 + np.sum(p))
                # Cut off the tail of the remaining distribution
                p = np.maximum(p - 0.002, 0).astype('float64')
                p = p / (1e-8 + np.sum(p))

                iexc[0, 0, 0] = np.argmax(np.random.multinomial(1, p[0, 0, :], 1))
                pcm[f * frame_size + i] = pred + ulaw2lin(iexc[0, 0, 0])
                fexc[0, 0, 0] = lin2ulaw(pcm[f * frame_size + i])
                mem = coef * mem + pcm[f * frame_size + i]
                # print(mem)
                np.array([np.round(mem)], dtype='int16').tofile(fout)
            skip = 0
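For reference, the prediction line in both loops walks the previous order
samples in reverse, i.e. it is the usual short-term LPC prediction, and the
running mem update is a first-order de-emphasis filter (the inverse of a
pre-emphasis x[n] - coef*x[n-1]). A standalone sketch of those two steps with
hypothetical helper names, illustrative only:

import numpy as np

def lpc_predict(a, pcm, n):
    # equivalent to  -sum(a * pcm[n-1:n-order-1:-1])
    return -sum(a[k] * pcm[n - 1 - k] for k in range(len(a)))

def deemphasis(samples, coef=0.85):
    # mem = coef*mem + x[n], i.e. y[n] = x[n] + coef*y[n-1]
    out, mem = np.zeros(len(samples)), 0.0
    for n, x in enumerate(samples):
        mem = coef * mem + x
        out[n] = mem
    return out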