Example #1
def generate_all(cnt):
    # source : loads audio, saves as spec
    path1 = os.path.join(source_wav_dir, source_wav_list[cnt])
    path2 = os.path.join(source_spec_save_dir,
                         source_wav_list[cnt][:-4] + '.png')
    _ = spec_from_path_to_path(path1, path2)
    spec_src, ratio = get_image(path2, sz, resize_input)
    spec_src = transform_image(spec_src, sz, ic, resize_input)

    # source : loads spec, saves as audio
    path3 = os.path.join(source_wav_save_dir, source_wav_list[cnt])
    spec = cv2.imread(path2)
    stft = mel_to_stft(spectrogram_img_to_mel(spec, threshold), sample_rate,
                       n_fft, n_mels, shrink_size, power)
    wave = griffin_lim(stft, griffin_lim_iter, n_fft, win_length, hop_length,
                       pre_emphasis_rate)
    librosa.output.write_wav(path3, wave, sample_rate, norm=True)

    if have_target:
        # target : loads audio, saves as spec
        path1 = os.path.join(target_wav_dir, source_wav_list[cnt])
        path2 = os.path.join(target_spec_save_dir,
                             source_wav_list[cnt][:-4] + '.png')
        _ = spec_from_path_to_path(path1, path2)

        # target : loads spec, saves as audio
        path3 = os.path.join(target_wav_save_dir, source_wav_list[cnt])
        spec = cv2.imread(path2)
        stft = mel_to_stft(spectrogram_img_to_mel(spec, threshold),
                           sample_rate, n_fft, n_mels, shrink_size, power)
        wave = griffin_lim(stft, griffin_lim_iter, n_fft, win_length,
                           hop_length, pre_emphasis_rate)
        librosa.output.write_wav(path3, wave, sample_rate, norm=True)

    for i in range(noise_per_image):
        # path to save generated spec
        path3 = os.path.join(out_spec_save_dir,
                             source_wav_list[cnt][:-4] + '-' + str(i) + '.png')
        noise = generate_noise(1, nz, device)
        # generate spec
        out = generate(netG, spec_src, noise, oc, device)
        # save it in the path
        cv2.imwrite(path3, out)

        # read the generated spec
        spec = cv2.imread(path3)
        # rescale the generated spec back to the original input width (undo the resize ratio)
        spec = cv2.resize(spec, (0, 0), fx=1 / ratio, fy=1)
        # makes it stft, then wave
        stft = mel_to_stft(spectrogram_img_to_mel(spec, threshold),
                           sample_rate, n_fft, n_mels, shrink_size, power)
        wave = griffin_lim(stft, griffin_lim_iter, n_fft, win_length,
                           hop_length, pre_emphasis_rate)
        # saves the wave
        path4 = os.path.join(out_wav_save_dir,
                             source_wav_list[cnt][:-4] + '-' + str(i) + '.wav')
        librosa.output.write_wav(path4, wave, sample_rate, norm=True)
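
Every example on this page calls a repo-specific griffin_lim helper, each with a slightly different signature. For orientation, here is a minimal sketch of the classic Griffin-Lim iteration written directly against librosa's stft/istft; the function name and default parameters are illustrative, not the helper used in these snippets.

import numpy as np
import librosa

def griffin_lim_sketch(mag, n_iter=100, n_fft=2048, win_length=1000, hop_length=250):
    # start from a random phase estimate
    angles = np.exp(2j * np.pi * np.random.rand(*mag.shape))
    for _ in range(n_iter):
        # invert the current complex STFT estimate to a waveform
        wave = librosa.istft(mag * angles, hop_length=hop_length, win_length=win_length)
        # re-analyze the waveform and keep only its phase
        rebuilt = librosa.stft(wave, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
        angles = np.exp(1j * np.angle(rebuilt))
    return librosa.istft(mag * angles, hop_length=hop_length, win_length=win_length)

Each iteration keeps the known magnitude and replaces only the phase, which is why the estimate converges toward a waveform whose spectrogram is consistent with mag.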
Example #2
                    #byh_ = generate_spec_img(bridhy, is_stft=True)

                    report_i = OrderedDict([
                        ("Origin_{}_{}".format(cnt, c), orig_),
                        #("Gen_{}".format(cnt), gen_),
                        ("Hybrid_{}_{}".format(cnt, c), hyb_),
                        #("Bridhy_{}".format(cnt), byh_),
                        ("NP_{}_{}".format(cnt, c), nop_),
                    ])

                    orig = generate_audio(orig, sr=sr, hop_length=512, is_stft=True)
                    #gen = generate_audio(gen, sr=8000, hop_length=512, is_stft=True)
                    hyb = generate_audio(hybrid, sr=sr, hop_length=512, is_stft=True)
                    #bhy = generate_audio(bridhy, sr=8000, hop_length=512, is_stft=True)
                    nop = generate_audio(no_phase, sr=sr, hop_length=512, is_stft=True)
                    lim, _, _ = griffin_lim(no_phase, n_fft=2048, hop_length=512, n_iter=250)

                    # note: np.sqrt(x**2) is elementwise |x|, i.e. per-sample absolute error
                    mse = np.sqrt((orig - hyb)**2)
                    nmse = np.sqrt((orig - nop)**2)
                    lmse = np.sqrt((orig - lim)**2)
                    mses.extend(mse)
                    nop_mses.extend(nmse)
                    lim_mses.extend(lmse)

                    report_a = OrderedDict([
                        ("wav_Origin_{}_{}".format(cnt, c), orig),
                        #("wav_Gen_{}".format(cnt), gen),
                        ("wav_Hyb_{}_{}".format(cnt, c), hyb),
                        #("wav_Bhy_{}".format(cnt), bhy),
                        ("wav_Nop_{}_{}".format(cnt, c), nop),
                        ("wav_GLim_{}_{}".format(cnt, c), lim),
threshold = 5
griffin_lim_iter = 100

sz, ic, oc, use_bn, norm_type = 256, 1, 1, True, 'instancenorm'
netG = UNet_G(ic, oc, sz, nz, use_bn, norm_type).to(device)
# netG = ResNet_G(ic, oc, sz, nz = nz, norm_type = norm_type).to(device)
netG.load_state_dict(torch.load(model_path, map_location='cpu'))
netG.eval()

out_cnt = 0
for cnt in range(len(input_wav_list)):
	y = read_audio(os.path.join(input_wav_dir, input_wav_list[cnt]), sample_rate, pre_emphasis_rate)
	mel = get_mel(get_stft(y, n_fft, win_length, hop_length), sample_rate, n_fft, n_mels, power, shrink_size)
	spec = mel_to_spectrogram(mel, threshold, os.path.join(input_spec_save_dir, input_wav_list[cnt][:-4]+'.png'))

	image, ratio = get_image(os.path.join(input_spec_save_dir, input_wav_list[cnt][:-4]+'.png'), sz)
	image = transform_image(image, sz, ic)

	for i in range(noise_per_image):
		noise = generate_noise(1, nz, device)
		out = generate(netG, image, noise, oc, sz, device)
		cv2.imwrite(os.path.join(output_spec_save_dir, input_wav_list[cnt][:-4] + '-' + str(i) + '.png'), out)

		spec = cv2.imread(os.path.join(output_spec_save_dir, input_wav_list[cnt][:-4] + '-' + str(i) + '.png'))
		spec = cv2.resize(spec, (0, 0), fx=1/ratio, fy=1)
		stft = mel_to_stft(spectrogram_img_to_mel(spec, threshold), sample_rate, n_fft, n_mels, shrink_size, power)
		wave = griffin_lim(stft, griffin_lim_iter, n_fft, win_length, hop_length, pre_emphasis_rate)

		librosa.output.write_wav(os.path.join(output_wav_dir, input_wav_list[cnt][:-4] + '-' + str(i) + '.wav'), wave, sample_rate, norm=True)
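
The mel_to_stft step above inverts the mel filterbank back to a linear-frequency magnitude spectrogram before Griffin-Lim runs. If the repo helper is unavailable, librosa ships an equivalent NNLS-based inversion (librosa >= 0.7); a minimal stand-in, with the power argument assumed to match the power used when building the mel:

import librosa

def mel_to_stft_sketch(mel, sr=22050, n_fft=2048, power=1.0):
    # least-squares inversion of the mel filterbank to a linear-frequency magnitude STFT
    return librosa.feature.inverse.mel_to_stft(mel, sr=sr, n_fft=n_fft, power=power)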

Example #4
import numpy as np
import librosa
import matplotlib.pyplot as plt
import soundfile
from scipy.io.wavfile import read
from utils import griffin_lim

if __name__ == '__main__':
    filename = 'audios/007064.wav'
    # assume 1 channel wav file
    sr, data = read(filename)
    data = data.astype(np.float32)

    # print(data.dtype)

    # STFT -> STFT magnitude
    stftm_matrix = np.abs(librosa.core.stft(data))
    # add random noise to simulate a modification
    stftm_matrix_modified = stftm_matrix + np.random.random(stftm_matrix.shape)
    # stftm_matrix_modified = stftm_matrix

    # Griffin-Lim estimate of the audio signal
    y_iters = griffin_lim(stftm_matrix_modified, data.shape)
    y = y_iters[0][0]

    # y = _griffin_lim(stftm_matrix_modified)

    print(y)

    plt.plot(y)
    plt.show()

    soundfile.write('out.wav', y, sr)
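
Because Example #4 perturbs the magnitude before inverting it, it can help to quantify how close the Griffin-Lim estimate lands to the original waveform. A small illustrative helper (not part of the example's utils module):

import numpy as np

def reconstruction_snr(reference, estimate):
    # trim both signals to a common length, then report SNR in dB
    n = min(len(reference), len(estimate))
    ref = np.asarray(reference[:n], dtype=float)
    est = np.asarray(estimate[:n], dtype=float)
    noise = ref - est
    return 10 * np.log10(np.sum(ref ** 2) / (np.sum(noise ** 2) + 1e-12))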
Example #5
y = record_audio()
y = librosa.util.normalize(y, norm=np.inf, axis=None)
print(np.max(y), np.min(y))
mel = get_mel(get_stft(y, 2048, 1000, 250), 22050, 2048, 256, 1, 1)
spec = mel_to_spectrogram(mel, 5, None)

print('Model Inference Start')
spec_t = transform_image(spec, 256, ic, resize_input=False)
out_spec = generate(netG, spec_t, noise, oc, device)
out_spec = out_spec.reshape(out_spec.shape[0], out_spec.shape[1])
print('Model Inference End')

print('Griffin Lim Process Start')
out_mel = spectrogram_img_to_mel(out_spec, 5, gray=True)
out_stft = mel_to_stft(out_mel, 22050, 2048, 256, 1, 1)
out = griffin_lim(out_stft, 300, 2048, 1000, 250, None)
out = librosa.util.normalize(out, norm=np.inf, axis=None)
print('Griffin Lim Process End')

out = out[500:]
play_audio(out.reshape(-1, 1), 22050)
print('-' * 8, 'Process complete.')

cv2.namedWindow('Input')
cv2.namedWindow('Output')

while True:
    cv2.imshow('Input', spec)
    cv2.imshow('Output', out_spec)

    key = cv2.waitKey(1) & 0xFF
    if key == ord('q'):
        break

cv2.destroyAllWindows()
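
Example #5 relies on record_audio and play_audio helpers that are not shown. A minimal stand-in using the sounddevice package (the library choice is an assumption, not necessarily what the repo uses):

import sounddevice as sd

def record_audio_sketch(seconds=5, sr=22050):
    # capture mono audio from the default input device
    buf = sd.rec(int(seconds * sr), samplerate=sr, channels=1, dtype='float32')
    sd.wait()  # block until the recording finishes
    return buf.reshape(-1)

def play_audio_sketch(wave, sr=22050):
    # blocking playback of a mono float waveform
    sd.play(wave, sr)
    sd.wait()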
Example #6
    d = d.unsqueeze(0)
    start = time.time()
    # volatile=True is the pre-0.4 PyTorch idiom for no-grad inference
    pred = model.forward(Variable(d[:, 0], volatile=True).cuda(args.gpu))
    mag = d.cpu().numpy()[0]
    pred = pred.data.cpu().numpy()[0, :1024, ...]
    # undo the log1p magnitude compression and attach the predicted phase
    stft = (np.exp(mag[0]) - 1) * np.exp(pred * 1.j)
    audio = generate_audio(stft, sr=args.sr, hop_length=args.hop, is_stft=True)
    end = time.time() - start
    runtimes.append(end)
    write_wav("demo/unet_{}_{}.wav".format(args.genre, c), audio, sr=args.sr)

print("UNet - avg {} sec per clip.".format(np.mean(runtimes)))

# Griffin-Lim
runtimes = []

for c, d in enumerate(data):
    d = d.unsqueeze(0)
    start = time.time()
    mag = d.cpu().numpy()[0]
    mag = np.exp(mag[0]) - 1
    lim, _, _ = griffin_lim(mag,
                            n_fft=args.n_fft,
                            hop_length=args.hop,
                            n_iter=250)
    end = time.time() - start
    runtimes.append(end)
    write_wav("demo/gl_{}_{}.wav".format(args.genre, c), lim, sr=args.sr)

print("GL - avg {} sec per clip".format(np.mean(runtimes)))
Example #7
import utils as mgu
import numpy as np
from scipy.io.wavfile import read
import math

# test 1
a = read('/Users/ihsan/GitHub/mfcc/speech.wav')

# native loadwav
# a = mgu.load_wav('/Users/ihsan/GitHub/mfcc/speech.wav')

# convert the audio samples to a float np.array
audio = np.array(a[1], dtype=float)

# framing parameters (taken from a MATLAB setup)
frameLen = 25
frameShiftMS = 10

# parameter setup
mag = mgu.specgram(audio)  # magnitude spectrogram
phase_angle = 0
# note: frameLen is in milliseconds, so this yields n_fft = 32 (see the note below)
n_fft = pow(2, int(math.ceil(math.log(frameLen) / math.log(2))))
hop = frameShiftMS
num_iters = 50

# griffin_lim function
mgu.griffin_lim(mag, phase_angle, n_fft, hop, num_iters)
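
A note on the n_fft computation above: frameLen and frameShiftMS are given in milliseconds, so taking the next power of two of 25 yields n_fft = 32, far too small for an audio frame. If the MATLAB parameters were meant to be converted to samples first, the arithmetic would look like this (the 16 kHz sample rate is an assumption; in practice use the rate returned by read):

import math

sr = 16000  # assumed sample rate; use the value read from the wav header in practice
frame_samples = int(sr * frameLen / 1000)         # 25 ms -> 400 samples
hop_samples = int(sr * frameShiftMS / 1000)       # 10 ms -> 160 samples
n_fft = 2 ** math.ceil(math.log2(frame_samples))  # 400 -> 512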