Example #1
    def __init__(self, model_dir):
        self.model_dir = model_dir
        sys.path.append(model_dir)
        import_module('extractor')  # imported for its side effects; the returned module object is not used
        self.extractor, self.epoch = misc.load_latest(model_dir, 'extractor')
        self.autoenc, _ = misc.load_latest(model_dir, 'autoencoder')
        self.extractor_args = self.extractor.args()

        use_cuda = not self.extractor_args.no_cuda and torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.extractor.to(self.device)
        self.autoenc.to(self.device)
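        # DataLoader options: pinned host memory and a single worker process, used only when running on CUDA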
        self.kwargs = {
            'num_workers': 1,
            'pin_memory': True
        } if use_cuda else {}

        self.extractor_optimizer = optim.Adam(self.extractor.parameters(),
                                              lr=self.extractor_args.lr,
                                              betas=(0.9, 0.99),
                                              eps=1e-5,
                                              weight_decay=0,
                                              amsgrad=False)

        self.best_eval_loss = np.inf
        self.best_eval_loss_latent = np.inf

        # resume from checkpoint: restore the saved optimizer state and reapply the configured learning rate
        if self.epoch != 0:
            self.extractor_optimizer.load_state_dict(
                torch.load(
                    os.path.join(model_dir,
                                 '{}_opt_extractor.pth'.format(self.epoch))))
            for g in self.extractor_optimizer.param_groups:
                g['lr'] = self.extractor_args.lr

        data_transform = self.extractor.data_transform()
        target_transform = self.extractor.target_transform()

        train_db = RirnetDatabase(is_training=True,
                                  args=self.extractor_args,
                                  data_transform=data_transform,
                                  target_transform=target_transform)
        eval_db = RirnetDatabase(is_training=False,
                                 args=self.extractor_args,
                                 data_transform=data_transform,
                                 target_transform=target_transform)
        self.train_loader = torch.utils.data.DataLoader(
            train_db,
            batch_size=self.extractor_args.batch_size,
            shuffle=True,
            **self.kwargs)
        self.eval_loader = torch.utils.data.DataLoader(
            eval_db,
            batch_size=self.extractor_args.batch_size,
            shuffle=True,
            **self.kwargs)

        self.extractor_mean_train_loss = 0
        self.extractor_mean_eval_loss = 0
Example #2
    def __init__(self, model_dir):
        self.model_dir = model_dir

        self.extractor, _ = misc.load_latest(model_dir, 'extractor')
        self.autoencoder, _ = misc.load_latest(model_dir, 'autoencoder')

        self.extractor_args = self.extractor.args()

        use_cuda = not self.extractor_args.no_cuda and torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.extractor.to(self.device)
        self.autoencoder.to(self.device)
        self.kwargs = {
            'num_workers': 1,
            'pin_memory': True
        } if use_cuda else {}

        data_transform = self.extractor.data_transform()
        target_transform = self.extractor.target_transform()

        eval_db = RirnetDatabase(is_training=False,
                                 args=self.extractor_args,
                                 data_transform=data_transform,
                                 target_transform=target_transform)
        self.eval_loader = torch.utils.data.DataLoader(
            eval_db,
            batch_size=self.extractor_args.batch_size,
            shuffle=True,
            **self.kwargs)

        self.audio_anechoic, self.fs = au.read_wav(
            '../../audio/harvard/male.wav')
Example #3
    def __init__(self, model_dir):
        self.model_dir = model_dir
        sys.path.append(model_dir)
        self.autoencoder, self.epoch = misc.load_latest(model_dir, 'autoencoder')
        self.autoencoder_args = self.autoencoder.args()
        use_cuda = not self.autoencoder_args.no_cuda and torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.autoencoder.to(self.device)
        self.kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

        # position-dependent MSE weight: a smooth ramp that starts at 1 and decays with sample index
        x = torch.linspace(-2, 2, 256)
        weight = (1 - torch.exp(2 * x)) / (4 * (1 + torch.exp(2 * x)))  # equivalent to -tanh(x) / 4
        weight += 1 - weight[0]
        weight = weight.repeat(self.autoencoder_args.batch_size, 2, 1)
        self.mse_weight = weight.to(self.device)  # keep on the same device as the model; .cuda() would fail on CPU-only runs

        self.autoencoder_optimizer = optim.Adam(self.autoencoder.parameters(), lr=self.autoencoder_args.lr, betas=(0.9, 0.99), eps=1e-5, weight_decay=0, amsgrad=False)

        if self.epoch != 0:
            self.autoencoder_optimizer.load_state_dict(torch.load(os.path.join(model_dir, '{}_opt_autoencoder.pth'.format(self.epoch))))

            for g in self.autoencoder_optimizer.param_groups:
                g['lr'] = self.autoencoder_args.lr
                g['momentum'] = self.autoencoder_args.momentum  # note: Adam has no momentum hyperparameter, so this entry is never used

        data_transform = self.autoencoder.data_transform()
        target_transform = self.autoencoder.target_transform()

        train_db = RirnetDatabase(is_training=True, args=self.autoencoder_args, data_transform=data_transform, target_transform=target_transform)
        eval_db = RirnetDatabase(is_training=False, args=self.autoencoder_args, data_transform=data_transform, target_transform=target_transform)
        self.train_loader = torch.utils.data.DataLoader(train_db, batch_size=self.autoencoder_args.batch_size, shuffle=True, **self.kwargs)
        self.eval_loader = torch.utils.data.DataLoader(eval_db, batch_size=self.autoencoder_args.batch_size, shuffle=True, **self.kwargs)

        self.autoencoder_mean_train_loss = 0
        self.autoencoder_mean_eval_loss = 0
Example #4
def main():
    net, _ = misc.load_latest('/home/eriklarsson/rirnet/timeconv/models', 'net')
    
    fs = 16384
    n_fft = 128

    sound_engine = SoundEngine('/home/eriklarsson/rirnet/audio/chamber/val', 44100)
    anechoic_signal = sound_engine.random()

    rir_real, _ = au.read_wav('/home/eriklarsson/rirnet/audio/rirs/lecture.wav', 44100)
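    # keep roughly the first half second of the measured RIR (22050 samples at 44.1 kHz)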
    rir_real = rir_real[:44100//2]
    rev_real = au.resample(au.convolve(rir_real, anechoic_signal), 44100, fs)
    
    _, _, rev_spectrogram = sp.signal.stft(rev_real, fs=fs, nfft=n_fft, nperseg=n_fft)
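    # the network input is the negative log-magnitude spectrogram of the reverberant signal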
    net_input = torch.from_numpy(-np.log(np.abs(rev_spectrogram))).unsqueeze(0).float()

    with torch.no_grad():
        net_output = net(net_input).squeeze().numpy()
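    # no phase is predicted, so reconstruct the time-domain RIR with uniformly random phase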
    phase = np.exp(1j * np.random.uniform(low=-np.pi, high=np.pi, size=np.shape(net_output)))
    _, rir_net = sp.signal.istft(net_output * phase, fs, nfft=n_fft, nperseg=n_fft)
    plt.imshow(net_output)
    plt.show()
    rir_net = au.resample(rir_net, fs, 44100)

    anechoic_test, _ = au.read_wav('/home/eriklarsson/rirnet/audio/harvard/male.wav')
    anechoic_test = anechoic_test[250000:400000,0]

    rev_real_test = au.convolve(rir_real, anechoic_test)
    rev_net_test = au.convolve(rir_net, anechoic_test)
    
    au.save_wav('real.wav', rev_real_test, 44100, True)
    au.save_wav('net.wav', rev_net_test, 44100, True)
Example #5
    def __init__(self, model_dir):
        sys.path.append('../../nanonet/rirnet')
        from rirnet_database import RirnetDatabase
        print(sys.path)
        self.model_dir = model_dir

        self.extractor, _ = misc.load_latest(model_dir, 'extractor')
        self.autoencoder, _ = misc.load_latest(model_dir, 'autoencoder')

        self.extractor_args = self.extractor.args()

        use_cuda = not self.extractor_args.no_cuda and torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.extractor.to(self.device)
        self.autoencoder.to(self.device)
        self.kwargs = {
            'num_workers': 1,
            'pin_memory': True
        } if use_cuda else {}

        data_transform = transforms.Compose([
            ToNormalized('../../database/mean.npy', '../../database/std.npy')
        ])
        target_transform = transforms.Compose(
            [ToNegativeLog(), ToUnitNorm(),
             ToTensor()])

        self.extractor_args.val_db_path = '../../database/db-val.csv'

        eval_db = RirnetDatabase(is_training=False,
                                 args=self.extractor_args,
                                 data_transform=data_transform,
                                 target_transform=target_transform)
        self.eval_loader = torch.utils.data.DataLoader(
            eval_db,
            batch_size=self.extractor_args.batch_size,
            shuffle=True,
            **self.kwargs)

        self.audio_anechoic, self.fs = au.read_wav(
            '../../audio/harvard/male.wav')
Example #6
    def __init__(self, model_dir):
        self.model_dir = model_dir
        sys.path.append(model_dir)
        self.net, self.epoch = misc.load_latest(model_dir, 'net')
        self._args = self.net.args()
        use_cuda = not self._args.no_cuda and torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        self.net.to(self.device)
        self.kwargs = {
            'num_workers': 1,
            'pin_memory': True
        } if use_cuda else {}

        self.net_optimizer = optim.Adam(self.net.parameters(),
                                        lr=self._args.lr,
                                        betas=(0.9, 0.99),
                                        eps=1e-5,
                                        weight_decay=0,
                                        amsgrad=False)

        data_transform = self.net.data_transform()
        target_transform = self.net.target_transform()

        train_db = RirnetDatabase(is_training=True,
                                  args=self._args,
                                  data_transform=data_transform,
                                  target_transform=target_transform)
        eval_db = RirnetDatabase(is_training=False,
                                 args=self._args,
                                 data_transform=data_transform,
                                 target_transform=target_transform)
        self.train_loader = torch.utils.data.DataLoader(
            train_db,
            batch_size=self._args.batch_size,
            shuffle=True,
            **self.kwargs)
        self.eval_loader = torch.utils.data.DataLoader(
            eval_db,
            batch_size=self._args.batch_size,
            shuffle=True,
            **self.kwargs)

        self.net_mean_train_loss = 0
        self.net_mean_eval_loss = 0
Example #7
import rirnet.acoustic_utils as au
import rirnet.misc as misc
import matplotlib.pyplot as plt
import numpy as np
import torch
import scipy as sp

fs = 16384

signal, _ = au.read_wav('../../audio/livingroom/full/mario.wav', fs)
start = np.maximum(np.random.randint(signal.shape[0] - fs), 0)  # clamp the random start index at zero
snippet = signal[start:start + fs]

net, _ = misc.load_latest('../models', 'net')
net.to("cuda")

a = True

while a:
    start = np.maximum(np.random.randint(signal.shape[0] - fs), 0)
    snippet = signal[start:start + fs]

    output = au.split_signal(signal,
                             rate=fs,
                             segment_length=fs // 4,
                             min_energy=10,
                             max_energy=20,
                             hop_length=128,
                             debug=False)
    if len(output) > 0:
Example #8
    def __init__(self, model_dir):
        model_dir = os.path.abspath(model_dir)
        self.extractor, _ = misc.load_latest(model_dir, 'extractor')
        self.autoencoder, _ = misc.load_latest(model_dir, 'autoencoder')
        self.extractor = self.extractor.double().eval()
        self.autoencoder = self.autoencoder.double().eval()
Example #9
def main():

    net_timeconv, _ = misc.load_latest(
        '/home/felix/rirnet/timeconv_felix/models', 'net')
    net_peaks_ae, _ = misc.load_latest('/home/felix/rirnet/nanonet/models/16',
                                       'autoencoder')
    net_peaks_ext, _ = misc.load_latest('/home/felix/rirnet/nanonet/models/16',
                                        'extractor')

    fs_peaks = 44100
    fs_timeconv = 44100
    n_fft = 128

    sound_engine = SoundEngine('/home/felix/rirnet/audio/chamber/val',
                               fs_peaks)
    material_engine = MaterialEngine('/home/felix/rirnet/wip/materials.csv',
                                     '/home/felix/rirnet/wip/surfaces.csv')
    for i in range(15):
        x = np.random.uniform(3, 15)
        y = np.random.uniform(3, 15)
        z = np.random.uniform(2, 4)
        mic_pos = rg.generate_pos_in_rect(x, y, z, 1)
        source_pos = rg.generate_pos_in_rect(x, y, z, 1)[0]

        abs_coeffs, info = material_engine.random()
        info.append(str(x))
        info.append(str(y))
        info.append(str(z))

        with open('cases_synthetic/info_{}.txt'.format(i), "w") as text_file:
            for elem in info:
                text_file.write(elem + '\n')

        multiband_rir = rg.generate_multiband_rirs(x, y, z, mic_pos,
                                                   source_pos, fs_timeconv, 60,
                                                   abs_coeffs)[0]
        monoband_rir = generate_monoband_rir(x, y, z, mic_pos, source_pos,
                                             fs_peaks, 8, abs_coeffs)

        an_sig_peaks = sound_engine.random()
        an_sig_timeconv = au.resample(an_sig_peaks, 44100, fs_timeconv)

        rev_sig_multi = sp.signal.fftconvolve(multiband_rir, an_sig_timeconv)
        _, _, rev_sig_multi_spectrogram = sp.signal.stft(rev_sig_multi,
                                                         fs=fs_timeconv,
                                                         nfft=n_fft,
                                                         nperseg=n_fft)
        _, _, multiband_rir_spectrogram = sp.signal.stft(multiband_rir,
                                                         fs=fs_timeconv,
                                                         nfft=n_fft,
                                                         nperseg=n_fft)
        input_timeconv = torch.from_numpy(
            -np.log(np.abs(rev_sig_multi_spectrogram))).unsqueeze(0).float()

        multiband_rir_spectrogram = np.abs(multiband_rir_spectrogram)
        with torch.no_grad():
            output_timeconv = net_timeconv(input_timeconv).squeeze().numpy()

        print(np.max(output_timeconv))
        output_timeconv /= np.max(output_timeconv)
        print(np.max(multiband_rir_spectrogram))
        multiband_rir_spectrogram /= np.max(multiband_rir_spectrogram)

        plt.subplot(221)
        plt.imshow(np.abs(output_timeconv))
        plt.subplot(222)
        plt.imshow(np.abs(multiband_rir_spectrogram))

        phase = np.exp(1j * np.random.uniform(
            low=-np.pi, high=np.pi, size=np.shape(output_timeconv)))
        _, output_timeconv = sp.signal.istft(np.abs(output_timeconv) * phase,
                                             fs=44100,
                                             nperseg=128,
                                             noverlap=64)
        phase = np.exp(1j * np.random.uniform(
            low=-np.pi, high=np.pi, size=np.shape(multiband_rir_spectrogram)))
        _, multiband_rir = sp.signal.istft(np.abs(multiband_rir_spectrogram) *
                                           phase,
                                           fs=44100,
                                           nperseg=128,
                                           noverlap=64)

        plt.subplot(223)
        plt.plot(output_timeconv)
        plt.subplot(224)
        plt.plot(multiband_rir)

        sounds = glob.glob("/home/felix/rirnet/audio/harvard/cases/*.wav")
        random_sound1, random_sound2 = random.sample(set(sounds), 2)
        test_sound, _ = librosa.core.load(random_sound1, sr=44100)
        ref_sound, _ = librosa.core.load(random_sound2, sr=44100)

        test_output = sp.signal.fftconvolve(test_sound, output_timeconv)
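        # normalize and scale to the int32 range so the wav files written below are 32-bit PCM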
        test_output /= np.max(np.abs(test_output))
        test_output *= 2147483647
        test_output = np.asarray(test_output, dtype=np.int32)

        test_input = sp.signal.fftconvolve(test_sound, multiband_rir)
        test_input /= np.max(np.abs(test_input))
        test_input *= 2147483647
        test_input = np.asarray(test_input, dtype=np.int32)

        ref_sound_rev = sp.signal.fftconvolve(ref_sound, multiband_rir)
        ref_sound_rev /= np.max(np.abs(ref_sound_rev))
        ref_sound_rev *= 2147483647
        ref_sound_rev = np.asarray(ref_sound_rev, dtype=np.int32)

        #test_output = au.resample(test_output, fs_timeconv, 44100)
        #test_input = au.resample(test_input, fs_timeconv, 44100)
        plt.savefig('spects_{}.png'.format(i))
        plt.close()
        sp.io.wavfile.write('cases_synthetic/test_output_{}.wav'.format(i),
                            fs_timeconv, test_output)
        sp.io.wavfile.write('cases_synthetic/test_input_{}.wav'.format(i),
                            fs_timeconv, test_input)
        sp.io.wavfile.write('cases_synthetic/test_reference_{}.wav'.format(i),
                            fs_timeconv, ref_sound_rev)
Example #10
def main():
    net_timeconv, _ = misc.load_latest('/home/felix/rirnet/timeconv/models',
                                       'net')
    net_peaks_ae, _ = misc.load_latest('/home/felix/rirnet/nanonet/models/16',
                                       'autoencoder')
    net_peaks_ext, _ = misc.load_latest('/home/felix/rirnet/nanonet/models/16',
                                        'extractor')

    x, y, z = 6, 9, 3
    mic_pos = rg.generate_pos_in_rect(x, y, z, 1)
    source_pos = rg.generate_pos_in_rect(x, y, z, 1)[0]
    fs_peaks = 44100
    fs_timeconv = 16384
    n_fft = 128

    sound_engine = SoundEngine('/home/felix/rirnet/audio/chamber/val',
                               fs_peaks)
    material_engine = MaterialEngine('/home/felix/rirnet/wip/materials.csv',
                                     '/home/felix/rirnet/wip/surfaces.csv')
    abs_coeffs = material_engine.random()

    multiband_rir = rg.generate_multiband_rirs(x, y, z, mic_pos, source_pos,
                                               fs_timeconv, 60, abs_coeffs)[0]
    monoband_rir = generate_monoband_rir(x, y, z, mic_pos, source_pos,
                                         fs_peaks, 8, abs_coeffs)

    an_sig_peaks = sound_engine.random()
    an_sig_timeconv = au.resample(an_sig_peaks, fs_peaks, fs_timeconv)

    rev_sig_multi = au.convolve(multiband_rir, an_sig_timeconv)
    _, _, rev_sig_multi_spectrogram = sp.signal.stft(rev_sig_multi,
                                                     fs=fs_timeconv,
                                                     nfft=n_fft,
                                                     nperseg=n_fft)
    input_timeconv = torch.from_numpy(
        -np.log(np.abs(rev_sig_multi_spectrogram))).unsqueeze(0).float()

    rev_sig_mono = au.pad_to(au.convolve(monoband_rir, an_sig_peaks), 2**16)
    input_peaks = preprocess_peaks(rev_sig_mono, fs_peaks)

    with torch.no_grad():
        output_timeconv = net_timeconv(input_timeconv).squeeze().numpy()
        output_peaks = net_peaks_ae(net_peaks_ext(input_peaks),
                                    decode=True).squeeze().numpy()
        plt.figure()
        plt.imshow(output_timeconv)
        plt.show()
    phase = np.exp(1j * np.random.uniform(
        low=-np.pi, high=np.pi, size=np.shape(output_timeconv)))
    _, output_timeconv = sp.signal.istft(output_timeconv * phase,
                                         fs_timeconv,
                                         nfft=n_fft,
                                         nperseg=n_fft)

    plt.subplot(221)
    plt.plot(output_timeconv)
    plt.subplot(222)
    rev_output = au.convolve(output_timeconv, an_sig_timeconv)
    plt.plot(rev_output / np.max(np.abs(rev_output)))
    #plt.scatter(output_peaks[0], output_peaks[1])
    plt.subplot(223)
    plt.plot(multiband_rir)
    plt.subplot(224)
    plt.plot(rev_sig_multi / np.max(np.abs(rev_sig_multi)))
    plt.show()

    au.save_wav('synthetic.wav', rev_output, fs_timeconv, True)
    au.save_wav('tru.wav', rev_sig_multi, fs_timeconv, True)
Example #11
def main():

    net_timeconv, _ = misc.load_latest('/home/felix/rirnet/timeconv_felix/models', 'net')

    fs_peaks = 44100
    fs_timeconv = 44100
    n_fft = 128

    sound_engine = SoundEngine('/home/felix/rirnet/audio/chamber/val', fs_peaks)
    for i in range(2):
        real_rir_path = '/home/felix/rirnet/audio/rirs/meeting.wav'
        real_rir, _ = librosa.core.load(real_rir_path, sr=44100)

        # strip leading silence: shift the RIR so it begins at its first significant sample
        ind_1st_nonzero = next((i for i, x in enumerate(real_rir) if x > 1e-3), None)
        real_rir[0:-ind_1st_nonzero] = real_rir[ind_1st_nonzero:]

        real_rir = real_rir[:7350] / 3

        multiband_rir = real_rir
        an_sig_peaks = sound_engine.random()
        an_sig_timeconv = au.resample(an_sig_peaks, 44100, fs_timeconv)

        rev_sig_multi = sp.signal.fftconvolve(multiband_rir, an_sig_timeconv)
        _, _, rev_sig_multi_spectrogram = sp.signal.stft(rev_sig_multi, fs=fs_timeconv, nfft=n_fft, nperseg=n_fft)
        _, _, multiband_rir_spectrogram = sp.signal.stft(multiband_rir, fs=fs_timeconv, nfft=n_fft, nperseg=n_fft)
        input_timeconv = torch.from_numpy(-np.log(np.abs(rev_sig_multi_spectrogram))).unsqueeze(0).float()

        multiband_rir_spectrogram = np.abs(multiband_rir_spectrogram)
        with torch.no_grad():
            output_timeconv = net_timeconv(input_timeconv).squeeze().numpy()

        print(np.max(output_timeconv))
        output_timeconv /= np.max(output_timeconv)
        print(np.max(multiband_rir_spectrogram))
        multiband_rir_spectrogram /= np.max(multiband_rir_spectrogram)

        plt.subplot(221)
        plt.imshow(np.abs(output_timeconv))
        plt.subplot(222)
        plt.imshow(np.abs(multiband_rir_spectrogram))

        phase = np.exp(1j * np.random.uniform(low=-np.pi, high=np.pi, size=np.shape(output_timeconv)))
        _, output_timeconv = sp.signal.istft(np.abs(output_timeconv) * phase, fs=44100, nperseg=128, noverlap=64)
        phase = np.exp(1j * np.random.uniform(low=-np.pi, high=np.pi, size=np.shape(multiband_rir_spectrogram)))
        _, multiband_rir = sp.signal.istft(np.abs(multiband_rir_spectrogram) * phase, fs=44100, nperseg=128, noverlap=64)

        plt.subplot(223)
        plt.plot(output_timeconv)
        plt.subplot(224)
        plt.plot(multiband_rir)

        sounds = glob.glob("/home/felix/rirnet/audio/harvard/cases/*.wav")
        random_sound1, random_sound2 = random.sample(set(sounds), 2)
        test_sound, _ = librosa.core.load(random_sound1, sr=44100)
        ref_sound, _ = librosa.core.load(random_sound2, sr=44100)

        test_output = sp.signal.fftconvolve(test_sound, output_timeconv)
        test_output /= np.max(np.abs(test_output))
        test_output *= 2147483647
        test_output = np.asarray(test_output, dtype=np.int32)

        test_input = sp.signal.fftconvolve(test_sound, multiband_rir)
        test_input /= np.max(np.abs(test_input))
        test_input *= 2147483647
        test_input = np.asarray(test_input, dtype=np.int32)

        plt.savefig('spects_{}.png'.format(i))
        plt.close()
        sp.io.wavfile.write('cases_real/test_meeting_output_{}.wav'.format(i), fs_timeconv, test_output)
        sp.io.wavfile.write('cases_real/test_meeting_input_{}.wav'.format(i), fs_timeconv, test_input)
Example #12
    def __init__(self, model_dir):
        self.model_dir = model_dir
        sys.path.append(model_dir)
        self.net, self.epoch = misc.load_latest(model_dir, 'net')
        self._args = self.net.args()