def __init__(self, model_dir):
    self.model_dir = model_dir
    sys.path.append(model_dir)
    # Import the model definition from model_dir; the module handle itself is not used further.
    extractor = import_module('extractor')
    self.extractor, self.epoch = misc.load_latest(model_dir, 'extractor')
    self.autoenc, _ = misc.load_latest(model_dir, 'autoencoder')
    self.extractor_args = self.extractor.args()

    use_cuda = not self.extractor_args.no_cuda and torch.cuda.is_available()
    self.device = torch.device("cuda" if use_cuda else "cpu")
    self.extractor.to(self.device)
    self.autoenc.to(self.device)
    self.kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    self.extractor_optimizer = optim.Adam(self.extractor.parameters(),
                                          lr=self.extractor_args.lr,
                                          betas=(0.9, 0.99), eps=1e-5,
                                          weight_decay=0, amsgrad=False)
    self.best_eval_loss = np.inf
    self.best_eval_loss_latent = np.inf

    if self.epoch != 0:
        # Resume from the saved optimizer state, but keep the learning rate from the current args.
        self.extractor_optimizer.load_state_dict(
            torch.load(os.path.join(model_dir, '{}_opt_extractor.pth'.format(self.epoch))))
        for g in self.extractor_optimizer.param_groups:
            g['lr'] = self.extractor_args.lr

    data_transform = self.extractor.data_transform()
    target_transform = self.extractor.target_transform()
    train_db = RirnetDatabase(is_training=True, args=self.extractor_args,
                              data_transform=data_transform,
                              target_transform=target_transform)
    eval_db = RirnetDatabase(is_training=False, args=self.extractor_args,
                             data_transform=data_transform,
                             target_transform=target_transform)
    self.train_loader = torch.utils.data.DataLoader(train_db,
                                                    batch_size=self.extractor_args.batch_size,
                                                    shuffle=True, **self.kwargs)
    self.eval_loader = torch.utils.data.DataLoader(eval_db,
                                                   batch_size=self.extractor_args.batch_size,
                                                   shuffle=True, **self.kwargs)

    self.extractor_mean_train_loss = 0
    self.extractor_mean_eval_loss = 0
def __init__(self, model_dir):
    self.model_dir = model_dir
    self.extractor, _ = misc.load_latest(model_dir, 'extractor')
    self.autoencoder, _ = misc.load_latest(model_dir, 'autoencoder')
    self.extractor_args = self.extractor.args()

    use_cuda = not self.extractor_args.no_cuda and torch.cuda.is_available()
    self.device = torch.device("cuda" if use_cuda else "cpu")
    self.extractor.to(self.device)
    self.autoencoder.to(self.device)
    self.kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    data_transform = self.extractor.data_transform()
    target_transform = self.extractor.target_transform()
    eval_db = RirnetDatabase(is_training=False, args=self.extractor_args,
                             data_transform=data_transform,
                             target_transform=target_transform)
    self.eval_loader = torch.utils.data.DataLoader(eval_db,
                                                   batch_size=self.extractor_args.batch_size,
                                                   shuffle=True, **self.kwargs)

    self.audio_anechoic, self.fs = au.read_wav('../../audio/harvard/male.wav')
def __init__(self, model_dir):
    self.model_dir = model_dir
    sys.path.append(model_dir)
    self.autoencoder, self.epoch = misc.load_latest(model_dir, 'autoencoder')
    self.autoencoder_args = self.autoencoder.args()

    use_cuda = not self.autoencoder_args.no_cuda and torch.cuda.is_available()
    self.device = torch.device("cuda" if use_cuda else "cpu")
    self.autoencoder.to(self.device)
    self.kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    # Weighting curve for the MSE loss: (1 - exp(2x)) / (4 * (1 + exp(2x))) equals -tanh(x) / 4,
    # shifted so the first weight is 1 and later samples are weighted progressively less.
    x = torch.linspace(-2, 2, 256)
    weight = (1 - torch.exp(2 * x)) / (4 * (1 + torch.exp(2 * x)))
    weight += 1 - weight[0]
    weight = weight.repeat(self.autoencoder_args.batch_size, 2, 1)
    self.mse_weight = weight.to(self.device)  # was weight.cuda(), which fails on CPU-only runs

    self.autoencoder_optimizer = optim.Adam(self.autoencoder.parameters(),
                                            lr=self.autoencoder_args.lr,
                                            betas=(0.9, 0.99), eps=1e-5,
                                            weight_decay=0, amsgrad=False)

    if self.epoch != 0:
        # Resume from the saved optimizer state, but keep lr from the current args.
        # Note: Adam has no 'momentum' hyperparameter; the extra key is stored but unused.
        self.autoencoder_optimizer.load_state_dict(
            torch.load(os.path.join(model_dir, '{}_opt_autoencoder.pth'.format(self.epoch))))
        for g in self.autoencoder_optimizer.param_groups:
            g['lr'] = self.autoencoder_args.lr
            g['momentum'] = self.autoencoder_args.momentum

    data_transform = self.autoencoder.data_transform()
    target_transform = self.autoencoder.target_transform()
    train_db = RirnetDatabase(is_training=True, args=self.autoencoder_args,
                              data_transform=data_transform,
                              target_transform=target_transform)
    eval_db = RirnetDatabase(is_training=False, args=self.autoencoder_args,
                             data_transform=data_transform,
                             target_transform=target_transform)
    self.train_loader = torch.utils.data.DataLoader(train_db,
                                                    batch_size=self.autoencoder_args.batch_size,
                                                    shuffle=True, **self.kwargs)
    self.eval_loader = torch.utils.data.DataLoader(eval_db,
                                                   batch_size=self.autoencoder_args.batch_size,
                                                   shuffle=True, **self.kwargs)

    self.autoencoder_mean_train_loss = 0
    self.autoencoder_mean_eval_loss = 0
def main():
    net, _ = misc.load_latest('/home/eriklarsson/rirnet/timeconv/models', 'net')
    fs = 16384
    n_fft = 128

    sound_engine = SoundEngine('/home/eriklarsson/rirnet/audio/chamber/val', 44100)
    anechoic_signal = sound_engine.random()
    rir_real, _ = au.read_wav('/home/eriklarsson/rirnet/audio/rirs/lecture.wav', 44100)
    rir_real = rir_real[:44100 // 2]

    # Network input: convolve the real RIR with an anechoic signal, resample,
    # and take the negative log magnitude spectrogram.
    rev_real = au.resample(au.convolve(rir_real, anechoic_signal), 44100, fs)
    _, _, rev_spectrogram = sp.signal.stft(rev_real, fs=fs, nfft=n_fft, nperseg=n_fft)
    net_input = torch.from_numpy(-np.log(np.abs(rev_spectrogram))).unsqueeze(0).float()

    with torch.no_grad():
        net_output = net(net_input).squeeze().numpy()

    # Reconstruct a waveform from the predicted spectrogram using random phase.
    phase = np.exp(1j * np.random.uniform(low=-np.pi, high=np.pi, size=np.shape(net_output)))
    _, rir_net = sp.signal.istft(net_output * phase, fs, nfft=n_fft, nperseg=n_fft)

    plt.imshow(net_output)
    plt.show()

    rir_net = au.resample(rir_net, fs, 44100)
    anechoic_test, _ = au.read_wav('/home/eriklarsson/rirnet/audio/harvard/male.wav')
    anechoic_test = anechoic_test[250000:400000, 0]
    rev_real_test = au.convolve(rir_real, anechoic_test)
    rev_net_test = au.convolve(rir_net, anechoic_test)
    au.save_wav('real.wav', rev_real_test, 44100, True)
    au.save_wav('net.wav', rev_net_test, 44100, True)
def __init__(self, model_dir):
    sys.path.append('../../nanonet/rirnet')
    from rirnet_database import RirnetDatabase
    print(sys.path)
    self.model_dir = model_dir
    self.extractor, _ = misc.load_latest(model_dir, 'extractor')
    self.autoencoder, _ = misc.load_latest(model_dir, 'autoencoder')
    self.extractor_args = self.extractor.args()

    use_cuda = not self.extractor_args.no_cuda and torch.cuda.is_available()
    self.device = torch.device("cuda" if use_cuda else "cpu")
    self.extractor.to(self.device)
    self.autoencoder.to(self.device)
    self.kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    data_transform = transforms.Compose([
        ToNormalized('../../database/mean.npy', '../../database/std.npy')])
    target_transform = transforms.Compose([ToNegativeLog(), ToUnitNorm(), ToTensor()])

    self.extractor_args.val_db_path = '../../database/db-val.csv'
    eval_db = RirnetDatabase(is_training=False, args=self.extractor_args,
                             data_transform=data_transform,
                             target_transform=target_transform)
    self.eval_loader = torch.utils.data.DataLoader(eval_db,
                                                   batch_size=self.extractor_args.batch_size,
                                                   shuffle=True, **self.kwargs)

    self.audio_anechoic, self.fs = au.read_wav('../../audio/harvard/male.wav')
def __init__(self, model_dir):
    self.model_dir = model_dir
    sys.path.append(model_dir)
    self.net, self.epoch = misc.load_latest(model_dir, 'net')
    self._args = self.net.args()

    use_cuda = not self._args.no_cuda and torch.cuda.is_available()
    self.device = torch.device("cuda" if use_cuda else "cpu")
    self.net.to(self.device)
    self.kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

    self.net_optimizer = optim.Adam(self.net.parameters(), lr=self._args.lr,
                                    betas=(0.9, 0.99), eps=1e-5,
                                    weight_decay=0, amsgrad=False)

    data_transform = self.net.data_transform()
    target_transform = self.net.target_transform()
    train_db = RirnetDatabase(is_training=True, args=self._args,
                              data_transform=data_transform,
                              target_transform=target_transform)
    eval_db = RirnetDatabase(is_training=False, args=self._args,
                             data_transform=data_transform,
                             target_transform=target_transform)
    self.train_loader = torch.utils.data.DataLoader(train_db, batch_size=self._args.batch_size,
                                                    shuffle=True, **self.kwargs)
    self.eval_loader = torch.utils.data.DataLoader(eval_db, batch_size=self._args.batch_size,
                                                   shuffle=True, **self.kwargs)

    self.net_mean_train_loss = 0
    self.net_mean_eval_loss = 0
import rirnet.acoustic_utils as au
import rirnet.misc as misc
import matplotlib.pyplot as plt
import numpy as np
import torch
import scipy as sp

fs = 16384
signal, _ = au.read_wav('../../audio/livingroom/full/mario.wav', fs)
# Pick a random one-second snippet; np.maximum replaces np.max(..., 0), which misreads 0 as an axis.
start = np.maximum(np.random.randint(signal.shape[0] - fs), 0)
snippet = signal[start:start + fs]

net, _ = misc.load_latest('../models', 'net')
net.to("cuda")

a = True
while a:
    start = np.maximum(np.random.randint(signal.shape[0] - fs), 0)
    snippet = signal[start:start + fs]
    output = au.split_signal(signal, rate=fs, segment_length=fs // 4,
                             min_energy=10, max_energy=20, hop_length=128, debug=False)
    if len(output) > 0:
        # The original body of this branch is missing; as a placeholder we simply stop
        # once split_signal returns at least one segment.
        break
def __init__(self, model_dir):
    model_dir = os.path.abspath(model_dir)
    self.extractor, _ = misc.load_latest(model_dir, 'extractor')
    self.autoencoder, _ = misc.load_latest(model_dir, 'autoencoder')
    self.extractor = self.extractor.double().eval()
    self.autoencoder = self.autoencoder.double().eval()
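# A minimal inference sketch (an assumption, not part of the original class): chain the loaded
# extractor and autoencoder decoder the same way the evaluation scripts below do with
# net_peaks_ae(net_peaks_ext(...), decode=True). The method name `estimate_peaks` and the
# expected input tensor are hypothetical.
def estimate_peaks(self, rev_input):
    with torch.no_grad():
        latent = self.extractor(rev_input.double())
        return self.autoencoder(latent, decode=True).squeeze().numpy()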
def main():
    net_timeconv, _ = misc.load_latest('/home/felix/rirnet/timeconv_felix/models', 'net')
    net_peaks_ae, _ = misc.load_latest('/home/felix/rirnet/nanonet/models/16', 'autoencoder')
    net_peaks_ext, _ = misc.load_latest('/home/felix/rirnet/nanonet/models/16', 'extractor')

    fs_peaks = 44100
    fs_timeconv = 44100
    n_fft = 128
    sound_engine = SoundEngine('/home/felix/rirnet/audio/chamber/val', fs_peaks)
    material_engine = MaterialEngine('/home/felix/rirnet/wip/materials.csv',
                                     '/home/felix/rirnet/wip/surfaces.csv')

    for i in range(15):
        # Random shoebox room, source/microphone positions, and surface materials.
        x = np.random.uniform(3, 15)
        y = np.random.uniform(3, 15)
        z = np.random.uniform(2, 4)
        mic_pos = rg.generate_pos_in_rect(x, y, z, 1)
        source_pos = rg.generate_pos_in_rect(x, y, z, 1)[0]
        abs_coeffs, info = material_engine.random()
        info.append(str(x))
        info.append(str(y))
        info.append(str(z))
        with open('cases_synthetic/info_{}.txt'.format(i), "w") as text_file:
            for elem in info:
                text_file.write(elem + '\n')

        multiband_rir = rg.generate_multiband_rirs(x, y, z, mic_pos, source_pos,
                                                   fs_timeconv, 60, abs_coeffs)[0]
        monoband_rir = generate_monoband_rir(x, y, z, mic_pos, source_pos,
                                             fs_peaks, 8, abs_coeffs)

        an_sig_peaks = sound_engine.random()
        an_sig_timeconv = au.resample(an_sig_peaks, 44100, fs_timeconv)
        rev_sig_multi = sp.signal.fftconvolve(multiband_rir, an_sig_timeconv)
        _, _, rev_sig_multi_spectrogram = sp.signal.stft(rev_sig_multi, fs=fs_timeconv,
                                                         nfft=n_fft, nperseg=n_fft)
        _, _, multiband_rir_spectrogram = sp.signal.stft(multiband_rir, fs=fs_timeconv,
                                                         nfft=n_fft, nperseg=n_fft)
        input_timeconv = torch.from_numpy(
            -np.log(np.abs(rev_sig_multi_spectrogram))).unsqueeze(0).float()
        multiband_rir_spectrogram = np.abs(multiband_rir_spectrogram)

        with torch.no_grad():
            output_timeconv = net_timeconv(input_timeconv).squeeze().numpy()
        print(np.max(output_timeconv))
        output_timeconv /= np.max(output_timeconv)
        print(np.max(multiband_rir_spectrogram))
        multiband_rir_spectrogram /= np.max(multiband_rir_spectrogram)

        plt.subplot(221)
        plt.imshow(np.abs(output_timeconv))
        plt.subplot(222)
        plt.imshow(np.abs(multiband_rir_spectrogram))

        # Rebuild waveforms from the normalised spectrograms using random phase.
        phase = np.exp(1j * np.random.uniform(low=-np.pi, high=np.pi,
                                              size=np.shape(output_timeconv)))
        _, output_timeconv = sp.signal.istft(np.abs(output_timeconv) * phase,
                                             fs=44100, nperseg=128, noverlap=64)
        phase = np.exp(1j * np.random.uniform(low=-np.pi, high=np.pi,
                                              size=np.shape(multiband_rir_spectrogram)))
        _, multiband_rir = sp.signal.istft(np.abs(multiband_rir_spectrogram) * phase,
                                           fs=44100, nperseg=128, noverlap=64)
        plt.subplot(223)
        plt.plot(output_timeconv)
        plt.subplot(224)
        plt.plot(multiband_rir)

        # Convolve randomly chosen speech samples with the reconstructed RIRs and store int32 wavs.
        sounds = glob.glob("/home/felix/rirnet/audio/harvard/cases/*.wav")
        random_sound1, random_sound2 = random.sample(set(sounds), 2)
        test_sound, _ = librosa.core.load(random_sound1, sr=44100)
        ref_sound, _ = librosa.core.load(random_sound2, sr=44100)

        test_output = sp.signal.fftconvolve(test_sound, output_timeconv)
        test_output /= np.max(np.abs(test_output))
        test_output *= 2147483647
        test_output = np.asarray(test_output, dtype=np.int32)

        test_input = sp.signal.fftconvolve(test_sound, multiband_rir)
        test_input /= np.max(np.abs(test_input))
        test_input *= 2147483647
        test_input = np.asarray(test_input, dtype=np.int32)

        ref_sound_rev = sp.signal.fftconvolve(ref_sound, multiband_rir)
        ref_sound_rev /= np.max(np.abs(ref_sound_rev))
        ref_sound_rev *= 2147483647
        ref_sound_rev = np.asarray(ref_sound_rev, dtype=np.int32)

        #test_output = au.resample(test_output, fs_timeconv, 44100)
        #test_input = au.resample(test_input, fs_timeconv, 44100)

        plt.savefig('spects_{}.png'.format(i))
        plt.close()

        sp.io.wavfile.write('cases_synthetic/test_output_{}.wav'.format(i),
                            fs_timeconv, test_output)
        sp.io.wavfile.write('cases_synthetic/test_input_{}.wav'.format(i),
                            fs_timeconv, test_input)
        sp.io.wavfile.write('cases_synthetic/test_reference_{}.wav'.format(i),
                            fs_timeconv, ref_sound_rev)
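# The normalise-and-write-as-int32 pattern above repeats for every output file; a small helper
# along these lines (hypothetical, not in the original code) would factor it out. It uses
# numpy and scipy.io.wavfile directly rather than the script's np/sp aliases.
import numpy as np
import scipy.io.wavfile

def write_int32_wav(path, signal, fs):
    # Peak-normalise to [-1, 1], scale to the full int32 range, then write 32-bit PCM.
    signal = signal / np.max(np.abs(signal))
    signal = np.asarray(signal * np.iinfo(np.int32).max, dtype=np.int32)
    scipy.io.wavfile.write(path, fs, signal)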
def main():
    net_timeconv, _ = misc.load_latest('/home/felix/rirnet/timeconv/models', 'net')
    net_peaks_ae, _ = misc.load_latest('/home/felix/rirnet/nanonet/models/16', 'autoencoder')
    net_peaks_ext, _ = misc.load_latest('/home/felix/rirnet/nanonet/models/16', 'extractor')

    x, y, z = 6, 9, 3
    mic_pos = rg.generate_pos_in_rect(x, y, z, 1)
    source_pos = rg.generate_pos_in_rect(x, y, z, 1)[0]
    fs_peaks = 44100
    fs_timeconv = 16384
    n_fft = 128
    sound_engine = SoundEngine('/home/felix/rirnet/audio/chamber/val', fs_peaks)
    material_engine = MaterialEngine('/home/felix/rirnet/wip/materials.csv',
                                     '/home/felix/rirnet/wip/surfaces.csv')
    abs_coeffs = material_engine.random()

    multiband_rir = rg.generate_multiband_rirs(x, y, z, mic_pos, source_pos,
                                               fs_timeconv, 60, abs_coeffs)[0]
    monoband_rir = generate_monoband_rir(x, y, z, mic_pos, source_pos,
                                         fs_peaks, 8, abs_coeffs)

    an_sig_peaks = sound_engine.random()
    an_sig_timeconv = au.resample(an_sig_peaks, fs_peaks, fs_timeconv)
    rev_sig_multi = au.convolve(multiband_rir, an_sig_timeconv)
    _, _, rev_sig_multi_spectrogram = sp.signal.stft(rev_sig_multi, fs=fs_timeconv,
                                                     nfft=n_fft, nperseg=n_fft)
    input_timeconv = torch.from_numpy(
        -np.log(np.abs(rev_sig_multi_spectrogram))).unsqueeze(0).float()

    rev_sig_mono = au.pad_to(au.convolve(monoband_rir, an_sig_peaks), 2**16)
    input_peaks = preprocess_peaks(rev_sig_mono, fs_peaks)

    with torch.no_grad():
        output_timeconv = net_timeconv(input_timeconv).squeeze().numpy()
        output_peaks = net_peaks_ae(net_peaks_ext(input_peaks),
                                    decode=True).squeeze().numpy()

    plt.figure()
    plt.imshow(output_timeconv)
    plt.show()

    phase = np.exp(1j * np.random.uniform(low=-np.pi, high=np.pi,
                                          size=np.shape(output_timeconv)))
    _, output_timeconv = sp.signal.istft(output_timeconv * phase, fs_timeconv,
                                         nfft=n_fft, nperseg=n_fft)

    plt.subplot(221)
    plt.plot(output_timeconv)
    plt.subplot(222)
    rev_output = au.convolve(output_timeconv, an_sig_timeconv)
    plt.plot(rev_output / np.max(np.abs(rev_output)))
    #plt.scatter(output_peaks[0], output_peaks[1])
    plt.subplot(223)
    plt.plot(multiband_rir)
    plt.subplot(224)
    plt.plot(rev_sig_multi / np.max(np.abs(rev_sig_multi)))
    plt.show()

    au.save_wav('synthetic.wav', rev_output, fs_timeconv, True)
    au.save_wav('tru.wav', rev_sig_multi, fs_timeconv, True)
def main():
    net_timeconv, _ = misc.load_latest('/home/felix/rirnet/timeconv_felix/models', 'net')

    fs_peaks = 44100
    fs_timeconv = 44100
    n_fft = 128
    sound_engine = SoundEngine('/home/felix/rirnet/audio/chamber/val', fs_peaks)

    for i in range(2):
        # Use a measured RIR: shift out the leading silence and truncate.
        real_rir_path = '/home/felix/rirnet/audio/rirs/meeting.wav'
        real_rir, _ = librosa.core.load(real_rir_path, sr=44100)
        ind_1st_nonzero = next((j for j, x in enumerate(real_rir) if x > 1e-3), None)
        real_rir[0:-ind_1st_nonzero] = real_rir[ind_1st_nonzero:]
        real_rir = real_rir[:7350] / 3
        multiband_rir = real_rir

        an_sig_peaks = sound_engine.random()
        an_sig_timeconv = au.resample(an_sig_peaks, 44100, fs_timeconv)
        rev_sig_multi = sp.signal.fftconvolve(multiband_rir, an_sig_timeconv)
        _, _, rev_sig_multi_spectrogram = sp.signal.stft(rev_sig_multi, fs=fs_timeconv,
                                                         nfft=n_fft, nperseg=n_fft)
        _, _, multiband_rir_spectrogram = sp.signal.stft(multiband_rir, fs=fs_timeconv,
                                                         nfft=n_fft, nperseg=n_fft)
        input_timeconv = torch.from_numpy(
            -np.log(np.abs(rev_sig_multi_spectrogram))).unsqueeze(0).float()
        multiband_rir_spectrogram = np.abs(multiband_rir_spectrogram)

        with torch.no_grad():
            output_timeconv = net_timeconv(input_timeconv).squeeze().numpy()
        print(np.max(output_timeconv))
        output_timeconv /= np.max(output_timeconv)
        print(np.max(multiband_rir_spectrogram))
        multiband_rir_spectrogram /= np.max(multiband_rir_spectrogram)

        plt.subplot(221)
        plt.imshow(np.abs(output_timeconv))
        plt.subplot(222)
        plt.imshow(np.abs(multiband_rir_spectrogram))

        # Rebuild waveforms from the normalised spectrograms using random phase.
        phase = np.exp(1j * np.random.uniform(low=-np.pi, high=np.pi,
                                              size=np.shape(output_timeconv)))
        _, output_timeconv = sp.signal.istft(np.abs(output_timeconv) * phase,
                                             fs=44100, nperseg=128, noverlap=64)
        phase = np.exp(1j * np.random.uniform(low=-np.pi, high=np.pi,
                                              size=np.shape(multiband_rir_spectrogram)))
        _, multiband_rir = sp.signal.istft(np.abs(multiband_rir_spectrogram) * phase,
                                           fs=44100, nperseg=128, noverlap=64)
        plt.subplot(223)
        plt.plot(output_timeconv)
        plt.subplot(224)
        plt.plot(multiband_rir)

        sounds = glob.glob("/home/felix/rirnet/audio/harvard/cases/*.wav")
        random_sound1, random_sound2 = random.sample(set(sounds), 2)
        test_sound, _ = librosa.core.load(random_sound1, sr=44100)
        ref_sound, _ = librosa.core.load(random_sound2, sr=44100)

        test_output = sp.signal.fftconvolve(test_sound, output_timeconv)
        test_output /= np.max(np.abs(test_output))
        test_output *= 2147483647
        test_output = np.asarray(test_output, dtype=np.int32)

        test_input = sp.signal.fftconvolve(test_sound, multiband_rir)
        test_input /= np.max(np.abs(test_input))
        test_input *= 2147483647
        test_input = np.asarray(test_input, dtype=np.int32)

        plt.savefig('spects_{}.png'.format(i))
        plt.close()
        sp.io.wavfile.write('cases_real/test_meeting_output_{}.wav'.format(i),
                            fs_timeconv, test_output)
        sp.io.wavfile.write('cases_real/test_meeting_input_{}.wav'.format(i),
                            fs_timeconv, test_input)
def __init__(self, model_dir):
    self.model_dir = model_dir
    sys.path.append(model_dir)
    self.net, self.epoch = misc.load_latest(model_dir, 'net')
    self._args = self.net.args()