def generate_all(cnt):
    # source: load the audio, save it as a spectrogram image
    path1 = os.path.join(source_wav_dir, source_wav_list[cnt])
    path2 = os.path.join(source_spec_save_dir, source_wav_list[cnt][:-4] + '.png')
    _ = spec_from_path_to_path(path1, path2)
    spec_src, ratio = get_image(path2, sz, resize_input)
    spec_src = transform_image(spec_src, sz, ic, resize_input)

    # source: load the spectrogram, save it back as audio
    path3 = os.path.join(source_wav_save_dir, source_wav_list[cnt])
    spec = cv2.imread(path2)
    stft = mel_to_stft(spectrogram_img_to_mel(spec, threshold), sample_rate, n_fft, n_mels, shrink_size, power)
    wave = griffin_lim(stft, griffin_lim_iter, n_fft, win_length, hop_length, pre_emphasis_rate)
    librosa.output.write_wav(path3, wave, sample_rate, norm=True)

    if have_target:
        # target: load the audio, save it as a spectrogram image
        path1 = os.path.join(target_wav_dir, source_wav_list[cnt])
        path2 = os.path.join(target_spec_save_dir, source_wav_list[cnt][:-4] + '.png')
        _ = spec_from_path_to_path(path1, path2)

        # target: load the spectrogram, save it back as audio
        path3 = os.path.join(target_wav_save_dir, source_wav_list[cnt])
        spec = cv2.imread(path2)
        stft = mel_to_stft(spectrogram_img_to_mel(spec, threshold), sample_rate, n_fft, n_mels, shrink_size, power)
        wave = griffin_lim(stft, griffin_lim_iter, n_fft, win_length, hop_length, pre_emphasis_rate)
        librosa.output.write_wav(path3, wave, sample_rate, norm=True)

    for i in range(noise_per_image):
        # path for the generated spectrogram
        path3 = os.path.join(out_spec_save_dir, source_wav_list[cnt][:-4] + '-' + str(i) + '.png')
        noise = generate_noise(1, nz, device)
        # generate a spectrogram and save it to that path
        out = generate(netG, spec_src, noise, oc, device)
        cv2.imwrite(path3, out)
        # read the generated spectrogram back
        spec = cv2.imread(path3)
        # rescale the generated spectrogram to the size of spec_src (the input)
        spec = cv2.resize(spec, (0, 0), fx=1 / ratio, fy=1)
        # convert it to an STFT, then to a waveform
        stft = mel_to_stft(spectrogram_img_to_mel(spec, threshold), sample_rate, n_fft, n_mels, shrink_size, power)
        wave = griffin_lim(stft, griffin_lim_iter, n_fft, win_length, hop_length, pre_emphasis_rate)
        # save the waveform
        path4 = os.path.join(out_wav_save_dir, source_wav_list[cnt][:-4] + '-' + str(i) + '.wav')
        librosa.output.write_wav(path4, wave, sample_rate, norm=True)
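# --- Aside: the mel_to_stft call above maps the mel spectrogram back to a
# linear-frequency magnitude before Griffin-Lim. A minimal sketch of that step,
# assuming the usual pseudo-inverse of the mel filterbank and ignoring the
# repo's shrink_size argument (librosa >= 0.7 ships an equivalent as
# librosa.feature.inverse.mel_to_stft); the name and signature here are
# illustrative, not the repo's.
import numpy as np
import librosa

def mel_to_stft_sketch(mel, sr, n_fft, n_mels, power=1.0):
    fb = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)  # (n_mels, 1 + n_fft // 2)
    mag = np.maximum(0.0, np.linalg.pinv(fb) @ mel)  # least-squares inversion, clipped at zero
    return mag ** (1.0 / power)  # undo the power scaling applied during analysis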
# byh_ = generate_spec_img(bridhy, is_stft=True)
report_i = OrderedDict([
    ("Origin_{}_{}".format(cnt, c), orig_),
    # ("Gen_{}".format(cnt), gen_),
    ("Hybrid_{}_{}".format(cnt, c), hyb_),
    # ("Bridhy_{}".format(cnt), byh_),
    ("NP_{}_{}".format(cnt, c), nop_),
])

orig = generate_audio(orig, sr=sr, hop_length=512, is_stft=True)
# gen = generate_audio(gen, sr=8000, hop_length=512, is_stft=True)
hyb = generate_audio(hybrid, sr=sr, hop_length=512, is_stft=True)
# bhy = generate_audio(bridhy, sr=8000, hop_length=512, is_stft=True)
nop = generate_audio(no_phase, sr=sr, hop_length=512, is_stft=True)
lim, _, _ = griffin_lim(no_phase, n_fft=2048, hop_length=512, n_iter=250)

# per-sample absolute reconstruction error against the original waveform
# (sqrt of the squared difference, despite the "mse" names)
mse = np.sqrt((orig - hyb) ** 2)
nmse = np.sqrt((orig - nop) ** 2)
lmse = np.sqrt((orig - lim) ** 2)
mses.extend(mse)
nop_mses.extend(nmse)
lim_mses.extend(lmse)

report_a = OrderedDict([
    ("wav_Origin_{}_{}".format(cnt, c), orig),
    # ("wav_Gen_{}".format(cnt), gen),
    ("wav_Hyb_{}_{}".format(cnt, c), hyb),
    # ("wav_Bhy_{}".format(cnt), bhy),
    ("wav_Nop_{}_{}".format(cnt, c), nop),
    ("wav_GLim_{}_{}".format(cnt, c), lim),
])
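# --- Aside: the per-sample errors collected above can be summarized into one
# figure per method. A minimal sketch of that aggregation, reusing the lists
# from the snippet (the print format is an assumption, not the repo's):
import numpy as np

for name, errs in [("hybrid", mses), ("no_phase", nop_mses), ("griffin_lim", lim_mses)]:
    errs = np.asarray(errs)
    print("{}: mean abs err {:.5f}, RMSE {:.5f}".format(name, errs.mean(), np.sqrt((errs ** 2).mean())))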
threshold = 5
griffin_lim_iter = 100
sz, ic, oc, use_bn, norm_type = 256, 1, 1, True, 'instancenorm'

netG = UNet_G(ic, oc, sz, nz, use_bn, norm_type).to(device)
# netG = ResNet_G(ic, oc, sz, nz=nz, norm_type=norm_type).to(device)
netG.load_state_dict(torch.load(model_path, map_location='cpu'))
netG.eval()

out_cnt = 0
for cnt in range(len(input_wav_list)):  # was len(input_wav_dir): iterate over the file list, not the directory path string
    y = read_audio(os.path.join(input_wav_dir, input_wav_list[cnt]), sample_rate, pre_emphasis_rate)
    mel = get_mel(get_stft(y, n_fft, win_length, hop_length), sample_rate, n_fft, n_mels, power, shrink_size)
    spec = mel_to_spectrogram(mel, threshold, os.path.join(input_spec_save_dir, input_wav_list[cnt][:-4] + '.png'))
    image, ratio = get_image(os.path.join(input_spec_save_dir, input_wav_list[cnt][:-4] + '.png'), sz)
    image = transform_image(image, sz, ic)
    for i in range(noise_per_image):
        noise = generate_noise(1, nz, device)
        out = generate(netG, image, noise, oc, sz, device)
        cv2.imwrite(os.path.join(output_spec_save_dir, input_wav_list[cnt][:-4] + '-' + str(i) + '.png'), out)
        spec = cv2.imread(os.path.join(output_spec_save_dir, input_wav_list[cnt][:-4] + '-' + str(i) + '.png'))
        # rescale the generated spectrogram back to the input's aspect ratio
        spec = cv2.resize(spec, (0, 0), fx=1 / ratio, fy=1)
        stft = mel_to_stft(spectrogram_img_to_mel(spec, threshold), sample_rate, n_fft, n_mels, shrink_size, power)
        wave = griffin_lim(stft, griffin_lim_iter, n_fft, win_length, hop_length, pre_emphasis_rate)
        librosa.output.write_wav(os.path.join(output_wav_dir, input_wav_list[cnt][:-4] + '-' + str(i) + '.wav'), wave, sample_rate, norm=True)
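# --- Aside: generate_noise supplies the generator's latent input. A
# hypothetical sketch assuming the common GAN convention of a standard-normal
# latent tensor (the shape is an assumption; the repo's version may differ):
import torch

def generate_noise_sketch(batch_size, nz, device):
    return torch.randn(batch_size, nz, 1, 1, device=device)  # one latent vector per sample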
import numpy as np
import librosa
import matplotlib.pyplot as plt
import soundfile
from scipy.io.wavfile import read
from utils import griffin_lim

if __name__ == '__main__':
    filename = 'audios/007064.wav'  # assume a 1-channel wav file
    sr, data = read(filename)
    data = data.astype(np.float16)
    # print(data.dtype)

    # STFT -> STFT magnitude
    stftm_matrix = np.abs(librosa.core.stft(data))
    # add random noise to simulate a modification
    stftm_matrix_modified = stftm_matrix + np.random.random(stftm_matrix.shape)
    # stftm_matrix_modified = stftm_matrix

    # estimate the audio signal with Griffin-Lim
    y_iters = griffin_lim(stftm_matrix_modified, data.shape)
    y = y_iters[0][0]
    # y = _griffin_lim(stftm_matrix_modified)

    print(y)
    plt.plot(y)
    plt.show()
    soundfile.write('out.wav', y, sr)
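# --- Aside: utils.griffin_lim itself is not shown here. A minimal sketch of
# the standard iteration, assuming librosa's STFT/ISTFT (name and defaults are
# illustrative): hold the target magnitude fixed and re-estimate phase by
# round-tripping through the inverse transform.
import numpy as np
import librosa

def griffin_lim_sketch(mag, n_iter=50, n_fft=2048, hop_length=512):
    phase = np.exp(2j * np.pi * np.random.rand(*mag.shape))  # random initial phase
    for _ in range(n_iter):
        y = librosa.istft(mag * phase, hop_length=hop_length)        # invert with current phase
        rebuilt = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)  # re-analyze
        phase = np.exp(1j * np.angle(rebuilt))                        # keep phase, discard magnitude
    return librosa.istft(mag * phase, hop_length=hop_length)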
y = record_audio()
y = librosa.util.normalize(y, norm=np.inf, axis=None)
print(np.max(y), np.min(y))

mel = get_mel(get_stft(y, 2048, 1000, 250), 22050, 2048, 256, 1, 1)
spec = mel_to_spectrogram(mel, 5, None)

print('Model Inference Start')
spec_t = transform_image(spec, 256, ic, resize_input=False)
out_spec = generate(netG, spec_t, noise, oc, device)
out_spec = out_spec.reshape(out_spec.shape[0], out_spec.shape[1])
print('Model Inference End')

print('Griffin Lim Process Start')
out_mel = spectrogram_img_to_mel(out_spec, 5, gray=True)
out_stft = mel_to_stft(out_mel, 22050, 2048, 256, 1, 1)
out = griffin_lim(out_stft, 300, 2048, 1000, 250, None)
out = librosa.util.normalize(out, norm=np.inf, axis=None)
print('Griffin Lim Process End')

out = out[500:]  # trim the first 500 samples
play_audio(out.reshape(-1, 1), 22050)
print('-' * 8, 'Process complete.')

cv2.namedWindow('Input')
cv2.namedWindow('Output')
while True:
    cv2.imshow('Input', spec)
    cv2.imshow('Output', out_spec)
    key = cv2.waitKey(1) & 0xFF
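# --- Aside: record_audio and play_audio are not shown in this snippet. A
# hypothetical sketch using the sounddevice package (an assumption; the repo's
# helpers may be implemented differently):
import numpy as np
import sounddevice as sd

def record_audio_sketch(seconds=3, sr=22050):
    buf = sd.rec(int(seconds * sr), samplerate=sr, channels=1, dtype='float32')
    sd.wait()  # block until the recording finishes
    return buf.reshape(-1)

def play_audio_sketch(y, sr=22050):
    sd.play(y, sr)
    sd.wait()  # block until playback finishes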
# UNet phase prediction (loop header restored to mirror the Griffin-Lim baseline below)
runtimes = []
for c, d in enumerate(data):
    d = d.unsqueeze(0)
    start = time.time()
    pred = model.forward(Variable(d[:, 0], volatile=True).cuda(args.gpu))  # legacy (pre-0.4) PyTorch inference mode
    mag = d.cpu().numpy()[0]
    pred = pred.data.cpu().numpy()[0, :1024, ...]
    # undo the log(1 + |S|) magnitude compression and attach the predicted phase
    stft = (np.exp(mag[0]) - 1) * np.exp(pred * 1.j)
    audio = generate_audio(stft, sr=args.sr, hop_length=args.hop, is_stft=True)
    end = time.time() - start
    runtimes.append(end)
    write_wav("demo/unet_{}_{}.wav".format(args.genre, c), audio, sr=args.sr)
print("UNet - avg {} sec per clip.".format(np.mean(runtimes)))

# Griffin-Lim baseline
runtimes = []
for c, d in enumerate(data):
    d = d.unsqueeze(0)
    start = time.time()
    mag = d.cpu().numpy()[0]
    mag = np.exp(mag[0]) - 1  # undo the log compression; Griffin-Lim gets magnitude only
    lim, _, _ = griffin_lim(mag, n_fft=args.n_fft, hop_length=args.hop, n_iter=250)
    end = time.time() - start
    runtimes.append(end)
    write_wav("demo/gl_{}_{}.wav".format(args.genre, c), lim, sr=args.sr)
print("GL - avg {} sec per clip".format(np.mean(runtimes)))
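# --- Aside: both branches above assume magnitudes were stored log-compressed
# as log(1 + |S|). A minimal sketch of the matching encode/decode pair
# (np.log1p / np.expm1 are the numerically stable forms of what the code
# inlines as np.exp(mag) - 1); the function names are illustrative:
import numpy as np

def compress_mag(stft):
    return np.log1p(np.abs(stft))  # log(1 + |S|)

def decompress_mag(mag):
    return np.expm1(mag)  # exp(mag) - 1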
import math
import numpy as np
from scipy.io.wavfile import read
import utils as mgu

# test 1
sr, data = read('/Users/ihsan/GitHub/mfcc/speech.wav')  # native wav loader
# data = mgu.load_wav('/Users/ihsan/GitHub/mfcc/speech.wav')
audio = np.array(data, dtype=float)  # convert the audio into an np.array

# pre-parameters (from MATLAB): frame length and shift in milliseconds
frameLen = 25
frameShiftMS = 10

# parameter setup
mag = mgu.specgram(audio)
phase_angle = 0
# convert the millisecond parameters to samples before deriving n_fft
# (the original used the raw millisecond values directly, yielding n_fft = 32)
frame_len = int(sr * frameLen / 1000)              # frame length in samples
n_fft = 2 ** int(math.ceil(math.log2(frame_len)))  # next power of two >= frame length
hop = int(sr * frameShiftMS / 1000)                # frame shift in samples
num_iters = 50

# run Griffin-Lim
mgu.griffin_lim(mag, phase_angle, n_fft, hop, num_iters)
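# Worked example of the parameter setup above: at sr = 16000, a 25 ms frame is
# 400 samples, so n_fft rounds up to the next power of two, 512, and a 10 ms
# shift gives hop = 160 samples.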