# NOTE(review): the line below is the tail of a model.compile(...) call whose
# opening lies outside this chunk — fragment kept verbatim.
              loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
model.summary()

# Command-line arguments: features produced by the analysis stage, and the
# matching raw PCM training audio.
feature_file = sys.argv[1]
pcm_file = sys.argv[2]  # raw PCM samples, read as signed 16-bit (dtype='int16' below)
frame_size = 160        # audio samples per feature frame
nb_features = 55        # values per frame in the feature file
nb_used_features = model.nb_used_features
feature_chunk_size = 15 # frames per training sequence
pcm_chunk_size = frame_size * feature_chunk_size

# u for unquantised, load 16 bit PCM samples and convert to mu-law
udata = np.fromfile(pcm_file, dtype='int16')
data = lin2ulaw(udata)
nb_frames = len(data) // pcm_chunk_size
features = np.fromfile(feature_file, dtype='float32')

# limit to discrete number of frames (drop any trailing partial chunk so the
# PCM and feature arrays stay aligned)
data = data[:nb_frames * pcm_chunk_size]
udata = udata[:nb_frames * pcm_chunk_size]
features = features[:nb_frames * feature_chunk_size * nb_features]

# Noise injection: the idea is that the real system is going to be
# predicting samples based on previously predicted samples rather than
# from the original. Since the previously predicted samples aren't
# expected to be so good, I add noise to the training data. Exactly
# how the noise is added makes a huge difference
# Sample-by-sample synthesis loop (Keras model).
# state1/state2 are the recurrent states of the decoder's two GRUs and are
# threaded through every dec.predict call; state1 is initialized before this
# chunk.
state2 = np.zeros((1, model.rnn_units2), dtype='float32')
mem = 0      # one-pole output filter memory (presumably de-emphasis — confirm)
coef = 0.85  # filter coefficient
fout = open(out_file, 'wb')
skip = order + 1  # no LPC history exists for the very first samples
for c in range(0, nb_frames):
    # Frame-rate network: run the encoder once per chunk of frames.
    cfeat = enc.predict([features[c:c+1, :, :nb_used_features], periods[c:c+1, :, :]])
    for fr in range(0, feature_chunk_size):
        f = c*feature_chunk_size + fr
        # Last `order` features of the frame are the LPC coefficients.
        a = features[c, fr, nb_features-order:]
        for i in range(skip, frame_size):
            # Linear prediction from the previous `order` output samples
            # (reverse slice walks pcm backwards from sample i-1).
            pred = -sum(a*pcm[f*frame_size + i - 1:f*frame_size + i - order-1:-1])
            fexc[0, 0, 1] = lin2ulaw(pred)
            # Sample-rate network: one step per sample, carrying GRU states.
            p, state1, state2 = dec.predict([fexc, iexc, cfeat[:, fr:fr+1, :], state1, state2])
            # Lower the temperature for voiced frames to reduce noisiness
            # (feature 37 is presumably a voicing/pitch-correlation measure — confirm).
            p *= np.power(p, np.maximum(0, 1.5*features[c, fr, 37] - .5))
            p = p/(1e-18 + np.sum(p))
            # Cut off the tail of the remaining distribution, then renormalize.
            p = np.maximum(p-0.002, 0).astype('float64')
            p = p/(1e-8 + np.sum(p))
            # Draw the excitation sample from the categorical distribution.
            iexc[0, 0, 0] = np.argmax(np.random.multinomial(1, p[0,0,:], 1))
            # Output sample = LPC prediction + decoded excitation.
            pcm[f*frame_size + i] = pred + ulaw2lin(iexc[0, 0, 0])
            fexc[0, 0, 0] = lin2ulaw(pcm[f*frame_size + i])
            # Filter and emit one int16 sample.
            mem = coef*mem + pcm[f*frame_size + i]
            np.array([np.round(mem)], dtype='int16').tofile(fout)
def synthesis(args, hparams):
    """Decode a feature file into 16-bit PCM audio with the PyTorch LPCNet.

    Loads weights from ``hparams.checkpoint_file``, runs the frame-rate
    encoder once over all features, then generates audio one sample at a
    time with the sample-rate decoder: each step forms an LPC prediction
    from past output, samples an excitation from the predicted mu-law
    distribution, and writes a de-emphasized int16 sample to the output.

    Args:
        args: provides ``feature_file`` (float32 feature dump) and
            ``out_file`` (raw int16 PCM output path).
        hparams: provides ``frame_size``, ``nb_features``, ``pitch_idx``,
            ``rnn_units1``, ``rnn_units2`` and ``checkpoint_file``.

    Returns:
        None. Returns early, writing nothing, when the checkpoint is
        missing (preserves the original best-effort behavior).
    """
    model = LPCNet(hparams).cuda()
    feature_file = args.feature_file
    out_file = args.out_file
    frame_size = hparams.frame_size
    nb_features = hparams.nb_features

    features = np.fromfile(feature_file, dtype='float32')
    # reshape (not np.resize): resize would silently drop a trailing partial
    # row, whereas reshape raises on a size mismatch.
    features = features.reshape(-1, nb_features)
    nb_frames = 1
    feature_chunk_size = features.shape[0]
    pcm_chunk_size = frame_size * feature_chunk_size
    features = np.reshape(features, (nb_frames, feature_chunk_size, nb_features))
    # Recover an integer pitch period from the normalized pitch feature.
    periods = (.1 + 50*features[:, :, hparams.pitch_idx:hparams.pitch_idx+1] + 100).astype('int16')

    # Best-effort: silently skip synthesis when no usable checkpoint exists.
    if hparams.checkpoint_file is None or not os.path.isfile(hparams.checkpoint_file):
        return
    checkpoint_dict = torch.load(hparams.checkpoint_file)
    model.load_state_dict(checkpoint_dict['state_dict'])
    model.eval()

    order = 16  # LPC order: number of past samples used by the predictor
    pcm = np.zeros((nb_frames * pcm_chunk_size,))
    fexc = np.zeros((1, 1, 2), dtype='float32')  # [prev sample, prediction], mu-law
    iexc = np.zeros((1, 1, 1), dtype='int16')    # previous excitation index
    state1 = torch.Tensor(np.zeros((1, 1, hparams.rnn_units1), dtype='float32')).cuda()
    state2 = torch.Tensor(np.zeros((1, 1, hparams.rnn_units2), dtype='float32')).cuda()
    mem = 0      # de-emphasis filter memory
    coef = 0.85  # de-emphasis coefficient

    # `with` guarantees the output file is closed even if decoding raises
    # (the original opened fout and never closed it).
    with open(out_file, "wb") as fout:
        skip = order + 1  # no LPC history for the very first samples
        for c in range(0, nb_frames):
            # NOTE(review): numpy arrays are passed straight to the encoder;
            # presumably LPCNet converts them to tensors internally — confirm.
            # Also note the full :nb_features slice here, where the Keras
            # version used :nb_used_features — confirm this is intentional.
            cfeat = model.encoder(features[c:c+1, :, :nb_features], periods[c:c+1, :, :])
            fexc[0, 0, 0] = 128  # mu-law code for 0
            iexc[0, 0, 0] = 128
            for fr in range(0, feature_chunk_size):
                f = c * feature_chunk_size + fr
                # Last `order` features of the frame are the LPC coefficients.
                a = features[c, fr, nb_features - order:]
                for i in range(skip, frame_size):
                    # Linear prediction from the previous `order` output samples.
                    pred = -sum(a*pcm[f*frame_size + i - 1:f*frame_size + i - order-1:-1])
                    fexc[0, 0, 1] = lin2ulaw(pred)
                    p_tensor, state1, state2 = model.decoder(fexc, iexc, cfeat[:, fr:fr+1, :], state1, state2)
                    p = p_tensor.clone().cpu().detach().numpy()
                    # Lower the temperature for voiced frames to reduce noisiness.
                    p *= np.power(p, np.maximum(0, 1.5 * features[c, fr, hparams.pitch_idx+1] - .5))
                    p = p / (1e-18 + np.sum(p))
                    # Cut off the tail of the remaining distribution, renormalize.
                    p = np.maximum(p - 0.002, 0).astype('float64')
                    p = p / (1e-8 + np.sum(p))
                    # Draw the excitation sample from the categorical distribution.
                    iexc[0, 0, 0] = np.argmax(np.random.multinomial(1, p[0, 0, :], 1))
                    # Output sample = LPC prediction + decoded excitation.
                    pcm[f * frame_size + i] = pred + ulaw2lin(iexc[0, 0, 0])
                    fexc[0, 0, 0] = lin2ulaw(pcm[f * frame_size + i])
                    # De-emphasize and emit one int16 sample.
                    mem = coef * mem + pcm[f * frame_size + i]
                    np.array([np.round(mem)], dtype='int16').tofile(fout)
                # Only the very first frame skips the warm-up samples.
                skip = 0