Example 1
def evaluation():
    with torch.no_grad():

        #
        # initialize network
        #
        np.random.seed(0)
        torch.manual_seed(0)

        network = EnsembleNetwork(
            filepath_gating=file_gating,
            filepaths_denoising=files_specialists,
            g_hs=hidden_size_gating,
            g_nl=num_layers_gating,
            s_hs=hidden_size_specialist,
            s_nl=num_layers_specialist,
            ct=args.latent_space,
        ).to(device=args.device_id)

        F.write_data(filename=os.path.join(output_directory,
                                           'files_gating.txt'),
                     data=str(file_gating))
        F.write_data(filename=os.path.join(output_directory,
                                           'files_specialist.txt'),
                     data=str(files_specialists))

        with torch.cuda.device(args.device_id):
            torch.cuda.empty_cache()

        if args.latent_space == 'gender':

            te_sisdr = {str(k): 0 for k in C.gender_all}
            for te_gender in C.gender_all:

                te_batch_durations = list()
                te_batch_sisdr = list()
                files_speech = np.random.choice(F.filter_by_gender(
                    te_utterances, te_gender),
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
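                    # trim both signals to the shorter one's length; stft_frames
                    # is the matching number of hop-sized STFT frames to keep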
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=C.snr_all)
                    (X, X_mag) = F.stft(x)

                    X = X.permute(
                        1, 0,
                        2)[:stft_frames]  # (seq_len, num_features, channel)
                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X = torch.unsqueeze(X, dim=0)
                    X_mag = torch.unsqueeze(X_mag, dim=0)
                    s = torch.unsqueeze(s, dim=0)
                    x = torch.unsqueeze(x, dim=0)

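                    # SI-SDR of the raw mixture; used below as an offset so the
                    # logged value is (presumably) improvement over the input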
                    actual_sisdr = float(F.calculate_sisdr(s, x).item())

                    # feed-forward
                    M_hat = network(X_mag)
                    s_hat = F.istft(X, mask=M_hat)

                    te_batch_sisdr.append(
                        (F.calculate_sisdr(s, s_hat,
                                           offset=actual_sisdr).mean().item()))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_sisdr[str(te_gender)] = np.average(
                    te_batch_sisdr, weights=te_batch_durations)

        elif args.latent_space == 'snr':

            te_sisdr = {str(k): 0 for k in C.snr_all}
            for te_snr in C.snr_all:

                te_batch_durations = list()
                te_batch_sisdr = list()
                files_speech = np.random.choice(te_utterances,
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=te_snr)
                    (X, X_mag) = F.stft(x)

                    X = X.permute(
                        1, 0,
                        2)[:stft_frames]  # (seq_len, num_features, channel)
                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X = torch.unsqueeze(X, dim=0)
                    X_mag = torch.unsqueeze(X_mag, dim=0)
                    s = torch.unsqueeze(s, dim=0)
                    x = torch.unsqueeze(x, dim=0)

                    actual_sisdr = float(F.calculate_sisdr(s, x).item())

                    # feed-forward
                    M_hat = network(X_mag)
                    s_hat = F.istft(X, mask=M_hat)

                    te_batch_sisdr.append(
                        (F.calculate_sisdr(s, s_hat,
                                           offset=actual_sisdr).mean().item()))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_sisdr[str(te_snr)] = np.average(te_batch_sisdr,
                                                   weights=te_batch_durations)

        te_sisdr['mean'] = np.mean(list(te_sisdr.values()))

        logging.info(json.dumps(te_sisdr, sort_keys=True, indent=4))
        F.write_data(filename=os.path.join(output_directory,
                                           'test_results.txt'),
                     data=te_sisdr)

    return
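Note: this example (like the later ones) scores enhancement through F.calculate_sisdr with offset set to the SI-SDR of the raw mixture, which suggests the logged number is an improvement over the noisy input rather than an absolute score. A minimal NumPy sketch of that convention, assuming the textbook SI-SDR definition (the real F.calculate_sisdr is not shown in these excerpts):

import numpy as np

def sisdr(reference, estimate):
    """Scale-invariant SDR in dB (standard definition, zero-mean inputs)."""
    reference = reference - reference.mean()
    estimate = estimate - estimate.mean()
    # project the estimate onto the reference to get the scaled target
    alpha = estimate.dot(reference) / reference.dot(reference)
    target = alpha * reference
    residual = estimate - target
    return 10.0 * np.log10(target.dot(target) / residual.dot(residual))

def sisdr_improvement(s, s_hat, x):
    # the offset convention assumed above: subtract the mixture's own SI-SDR
    return sisdr(s, s_hat) - sisdr(s, x)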
Example 2
def evaluation():
    with torch.no_grad():

        sum_num_params = 0

        #
        # initialize gating network
        #
        gating = GatingNetwork(hidden_size_gating, num_layers_gating, len(gender_all)).to(device=args.device_id)
        gating.load_state_dict(torch.load(
            args.state_dict_file_gating, map_location=torch.device(args.device_id))
        )
        gating.eval()
        sum_num_params += F.count_parameters(gating)


        #
        # initialize specialist networks (as a hashed list of networks)
        #
        specialists = {
            i: SpecialistNetwork(hidden_size_specialist, num_layers_specialist).to(device=args.device_id)
            for i in range(len(gender_all))
        }
        for i in range(len(gender_all)):
            assert re.search(r'gender_[MF]', args.state_dict_file_specialist) is not None
            filepath = re.sub(r'gender_[MF]', F.fmt_gender(gender_all[i]), args.state_dict_file_specialist)
            specialists[i].load_state_dict(torch.load(
                filepath, map_location=torch.device(args.device_id))
            )
            specialists[i].eval()
            sum_num_params += F.count_parameters(specialists[i])


        F.write_data(filename=os.path.join(output_directory, 'num_parameters.txt'),
                     data=sum_num_params)
        with torch.cuda.device(args.device_id):
            torch.cuda.empty_cache()


        #
        # log experiment configuration
        #
        logging.info('All results will be stored in "{}".'.format(
            output_directory))
        logging.info('Testing {} model (with Gating architecture {} and Specialist architecture {}) to denoise {} gendered mixtures...'.format(
            model_name, architecture_gating, architecture_specialist, gender_all))
        logging.info('Using GPU device {}...'.format(
            args.device_id))


        fields = ['snr_val','num_mixtures','sdr','sisdr','mse','bce','accuracy']

        #
        # validation
        #
        results_validation = []
        np.random.seed(0)
        torch.manual_seed(0)
        for gender_val in gender_all:

            # construct a batch
            batch = F.generate_batch(
                vl_utterances, vl_noises,
                batch_size=vl_batch_size,
                gender=gender_val,
                device=args.device_id,
            )
            Y = batch.index_gender

            # compute batch-wise specialist probabilities
            Y_hat = gating(batch.X_mag)

            # pick the best specialist to apply to the whole batch (based on batch probabilities sum)
            k = int(Y_hat.sum(dim=0).argmax().item())

            # apply the best specialist to the entire batch
            M_hat = specialists[k](batch.X_mag)
            s_hat = F.istft(batch.X, mask=M_hat)

            results_validation.append([
                gender_val,
                vl_batch_size,
                float(F.calculate_sdr(batch.s, s_hat, offset=batch.actual_sdr).mean().item()),
                float(F.calculate_sisdr(batch.s, s_hat, offset=batch.actual_sisdr).mean().item()),
                float(F.calculate_mse(batch.M, M_hat).item()),
                float(F.calculate_bce(batch.M, M_hat).item()),
                float(F.calculate_accuracy(Y, Y_hat)),
            ])
            status = (
                f'Validation Data (Gender={gender_val}) -- '
                f'SDR: {results_validation[-1][2]:>6.3f} dB, '
                f'\033[33mSISDR: {results_validation[-1][3]:>6.3f} dB\033[39m, '
                f'MSE: {results_validation[-1][4]:>6.3f}, '
                f'BCE: {results_validation[-1][5]:>6.3f}, '
                f'Accuracy: {results_validation[-1][6]:>6.3f}'
            )
            logging.info(status)
        F.write_table(filename=os.path.join(output_directory, 'validation_results.txt'),
                      table_data=results_validation, headers=fields)


        #
        # testing
        #
        results_testing = []

        for gender_val in gender_all:
            np.random.seed(0)
            torch.manual_seed(0)
            te_utterances_filtered = te_utterances[np.array([(F.get_gender(row) in gender_val) for row in te_utterances])]
            files_speech = np.random.choice(te_utterances_filtered, size=te_batch_size)
            files_noise = np.random.choice(te_noises, size=te_batch_size)
            te_m_durations = list()
            te_m_sdr = list()
            te_m_sisdr = list()
            te_m_mse = list()
            te_m_bce = list()
            te_m_accuracy = list()
            for (i, fs, fn) in zip(range(te_batch_size), files_speech, files_noise):

                source = F.load_audio(fs, duration=None, random_offset=False, device=args.device_id)
                noise = F.load_audio(fn, duration=None, random_offset=False, device=args.device_id)
                min_length = min(len(source), len(noise))
                stft_frames = ceil(min_length/hop_size)
                source = source[:min_length]
                noise = noise[:min_length]

                (x, s, n) = F.mix_signals(source, noise, snr_db=snr_all)
                (S, S_mag) = F.stft(s)
                (N, N_mag) = F.stft(n)
                (X, X_mag) = F.stft(x)
                M = F.calculate_masking_target(S_mag, N_mag)

                X = X.permute(1, 0, 2)[:stft_frames] # (seq_len, num_features, channel)
                S_mag = S_mag.permute(1, 0)[:stft_frames]  # (seq_len, num_features)
                N_mag = N_mag.permute(1, 0)[:stft_frames]  # (seq_len, num_features)
                X_mag = X_mag.permute(1, 0)[:stft_frames]  # (seq_len, num_features)
                M = M.permute(1, 0)[:stft_frames]  # (seq_len, num_features)

                actual_sdr = float(F.calculate_sdr(s, x).item())
                actual_sisdr = float(F.calculate_sisdr(s, x).item())

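                # one-hot gender target: index 1 marks female, matching the
                # int(F.get_gender(fs) == 'F') convention used here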
                gender_index = int(F.get_gender(fs) == 'F')
                Y = torch.zeros(1, len(gender_all), device=args.device_id)
                Y[..., gender_index] = 1

                # add a fake batch axis to everything
                x = torch.unsqueeze(x, dim=0)
                s = torch.unsqueeze(s, dim=0)
                n = torch.unsqueeze(n, dim=0)
                S = torch.unsqueeze(S, dim=0)
                S_mag = torch.unsqueeze(S_mag, dim=0)
                N = torch.unsqueeze(N, dim=0)
                N_mag = torch.unsqueeze(N_mag, dim=0)
                X = torch.unsqueeze(X, dim=0)
                X_mag = torch.unsqueeze(X_mag, dim=0)
                M = torch.unsqueeze(M, dim=0)

                # compute batch-wise specialist probabilities
                Y_hat = gating(X_mag)

                # pick the best specialist to apply to the whole batch (based on batch probabilities sum)
                k = int(Y_hat.sum(dim=0).argmax().item())

                # apply the best specialist to the entire batch
                M_hat = specialists[k](X_mag)
                s_hat = F.istft(X, mask=M_hat)

                te_m_sdr.append(F.calculate_sdr(s, s_hat, offset=actual_sdr).mean().item())
                te_m_sisdr.append(F.calculate_sisdr(s, s_hat, offset=actual_sisdr).mean().item())
                te_m_mse.append(F.calculate_mse(M, M_hat).item())
                te_m_bce.append(F.calculate_bce(M, M_hat).item())
                te_m_accuracy.append(float(torch.prod(Y == torch.round(Y_hat), dim=-1).sum().item() / float(len(Y))))
                te_m_durations.append(min_length)

            # store the weighted average results
            results_testing.append([
                gender_val,
                te_batch_size,
                np.average(te_m_sdr, weights=te_m_durations),
                np.average(te_m_sisdr, weights=te_m_durations),
                np.average(te_m_mse, weights=te_m_durations),
                np.average(te_m_bce, weights=te_m_durations),
                np.average(te_m_accuracy, weights=te_m_durations),
            ])
            status = (
                f'Test Data (Gender={gender_val}) -- '
                f'SDR: {results_testing[-1][2]:>6.3f} dB, '
                f'\033[33mSISDR: {results_testing[-1][3]:>6.3f} dB\033[39m, '
                f'MSE: {results_testing[-1][4]:>6.3f}, '
                f'BCE: {results_testing[-1][5]:>6.3f}, '
                f'Accuracy: {results_testing[-1][6]:>6.3f}'
            )
            logging.info(status)
        F.write_table(filename=os.path.join(output_directory, 'test_results.txt'),
                      table_data=results_testing, headers=fields)
    return
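The selection step in this example is a hard mixture-of-experts routing: the gating probabilities are summed over the batch and a single specialist handles every item. A toy sketch of just that step, with stand-in linear "specialists" (GatingNetwork and SpecialistNetwork are not defined in these excerpts):

import torch

torch.manual_seed(0)
batch_size, num_classes, num_features = 4, 2, 8

# stand-ins: per-item class probabilities and one tiny "specialist" per class
Y_hat = torch.softmax(torch.randn(batch_size, num_classes), dim=-1)
specialists = {i: torch.nn.Linear(num_features, num_features)
               for i in range(num_classes)}

# sum class probabilities over the batch, then route everyone to the winner
k = int(Y_hat.sum(dim=0).argmax().item())

X_mag = torch.rand(batch_size, num_features)
M_hat = torch.sigmoid(specialists[k](X_mag))  # mask-like output in [0, 1]
print(f'routed whole batch to specialist {k}')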
Example 3
def evaluation():
    with torch.no_grad():

        #
        # initialize network
        #
        np.random.seed(0)
        torch.manual_seed(0)

        network = GatingNetwork(
            args.hidden_size,
            args.num_layers,
            args.num_clusters,
        ).to(device=args.device_id)

        network_params = F.count_parameters(network)

        network.load_state_dict(torch.load(
            args.state_dict_file,
            map_location=torch.device(args.device_id),
        ),
                                strict=False)
        network.eval()

        F.write_data(filename=os.path.join(output_directory,
                                           'num_parameters.txt'),
                     data=network_params)

        with torch.cuda.device(args.device_id):
            torch.cuda.empty_cache()

        if args.latent_space == 'gender':

            te_accuracy = {str(k): 0 for k in C.gender_all}
            for te_gender in C.gender_all:

                te_batch_durations = list()
                te_batch_accuracy = list()
                files_speech = np.random.choice(F.filter_by_gender(
                    te_utterances, te_gender),
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=C.snr_all)
                    (X, X_mag) = F.stft(x)

                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X_mag = torch.unsqueeze(X_mag, dim=0)

                    Y = torch.zeros(1,
                                    len(C.gender_all),
                                    device=args.device_id)
                    gender_index = int(te_gender == 'F')
                    Y[..., gender_index] = 1

                    # forward pass
                    Y_hat = network(X_mag)

                    te_batch_accuracy.append(F.calculate_accuracy(Y, Y_hat))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_accuracy[str(te_gender)] = np.average(
                    te_batch_accuracy, weights=te_batch_durations)

        elif args.latent_space == 'snr':

            te_accuracy = {str(k): 0 for k in C.snr_all}
            for te_snr in C.snr_all:

                te_batch_durations = list()
                te_batch_accuracy = list()
                files_speech = np.random.choice(te_utterances,
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=te_snr)
                    (X, X_mag) = F.stft(x)

                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X_mag = torch.unsqueeze(X_mag, dim=0)

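                    # class label = the snr_all entry nearest the measured SDR
                    # of the mixture (SDR standing in for the mixing SNR here)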
                    actual_sdr = float(F.calculate_sdr(s, x).item())
                    sdr_index = int(np.abs(C.snr_all - actual_sdr).argmin())
                    Y = torch.zeros(1, len(C.snr_all), device=args.device_id)
                    Y[..., sdr_index] = 1

                    # forward pass
                    Y_hat = network(X_mag)

                    te_batch_accuracy.append(F.calculate_accuracy(Y, Y_hat))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_accuracy[str(te_snr)] = np.average(
                    te_batch_accuracy, weights=te_batch_durations)

        te_accuracy['mean'] = np.mean(list(te_accuracy.values()))

        logging.info(json.dumps(te_accuracy, sort_keys=True, indent=4))
        F.write_data(filename=os.path.join(output_directory,
                                           'test_results.txt'),
                     data=te_accuracy)

    return
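F.calculate_accuracy is not shown in these excerpts; a plausible reading, consistent with the inline check torch.prod(Y == torch.round(Y_hat), dim=-1) in Example 2, is an exact-match test on the rounded prediction. A sketch under that assumption:

import torch

def calculate_accuracy(Y, Y_hat):
    # fraction of rows whose rounded prediction equals the one-hot target;
    # assumed form only -- the real helper may use an argmax comparison
    matches = torch.prod((Y == torch.round(Y_hat)).float(), dim=-1)
    return float(matches.sum().item()) / float(len(Y))

Y = torch.tensor([[1.0, 0.0]])
Y_hat = torch.tensor([[0.8, 0.2]])   # rounds to [1, 0] -> a hit
print(calculate_accuracy(Y, Y_hat))  # 1.0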
Example 4
        for mixture_snr in te_snrs:
            if mixture_snr != -5:
                continue

            # mix the signals up
            source = F.load_audio(fs,
                                  device=args.device_id,
                                  random_offset=False,
                                  duration=None)
            noise = F.load_audio(fn,
                                 device=args.device_id,
                                 random_offset=False,
                                 duration=None)
            min_length = min(len(source), len(noise))
            (x, s, n) = F.mix_signals(source[:min_length],
                                      noise[:min_length],
                                      snr_db=mixture_snr)
            (S, S_mag) = F.stft(s)
            (N, N_mag) = F.stft(n)
            (X, X_mag) = F.stft(x)
            M = F.calculate_masking_target(S_mag, N_mag)
            X = X.permute(1, 0, 2)
            S_mag = S_mag.permute(1, 0)
            N_mag = N_mag.permute(1, 0)
            X_mag = X_mag.permute(1, 0)
            M = M.permute(1, 0)
            actual_sdr = float(F.calculate_sdr(s, x).item())
            actual_sisdr = float(F.calculate_sisdr(s, x).item())

            # inference: predicted mask, kept distinct from the target M above
            M_hat = network(X_mag.unsqueeze(0))
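F.calculate_masking_target(S_mag, N_mag) is also left undefined in these excerpts; given that the masks elsewhere are scored with BCE, a binary target is one plausible form. A minimal ideal-binary-mask sketch under that assumption (an ideal ratio mask S_mag / (S_mag + N_mag) would fit the losses equally well):

import torch

def calculate_masking_target(S_mag, N_mag):
    # ideal binary mask: 1 where speech magnitude dominates noise, else 0
    return (S_mag > N_mag).float()

S_mag = torch.tensor([[3.0, 0.5], [1.0, 2.0]])
N_mag = torch.tensor([[1.0, 1.0], [2.0, 0.5]])
print(calculate_masking_target(S_mag, N_mag))  # [[1., 0.], [0., 1.]]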
Example 5
def evaluation():
    with torch.no_grad():

        #
        # initialize network
        #
        np.random.seed(0)
        torch.manual_seed(0)

        network = DenoisingNetwork(args.hidden_size,
                                   args.num_layers).to(device=args.device_id)

        network_params = F.count_parameters(network)

        network.load_state_dict(torch.load(
            args.state_dict_file,
            map_location=torch.device(args.device_id),
        ),
                                strict=True)
        network.eval()

        F.write_data(filename=os.path.join(output_directory,
                                           'num_parameters.txt'),
                     data=network_params)

        with torch.cuda.device(args.device_id):
            torch.cuda.empty_cache()

        te_sisdr = dict()

        if args.latent_space in ('gender', 'all'):
            np.random.seed(0)
            torch.manual_seed(0)

            for te_gender in C.gender_all:

                logging.info(
                    f'Now testing model with {te_gender}-gender inputs...')

                te_batch_durations = list()
                te_batch_sisdr = list()
                files_speech = np.random.choice(F.filter_by_gender(
                    te_utterances, te_gender),
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=C.snr_all)
                    (X, X_mag) = F.stft(x)

                    X = X.permute(
                        1, 0,
                        2)[:stft_frames]  # (seq_len, num_features, channel)
                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X = torch.unsqueeze(X, dim=0)
                    X_mag = torch.unsqueeze(X_mag, dim=0)
                    s = torch.unsqueeze(s, dim=0)
                    x = torch.unsqueeze(x, dim=0)

                    actual_sisdr = float(F.calculate_sisdr(s, x).item())

                    # feed-forward
                    M_hat = network(X_mag)
                    s_hat = F.istft(X, mask=M_hat)

                    te_batch_sisdr.append(
                        (F.calculate_sisdr(s, s_hat,
                                           offset=actual_sisdr).mean().item()))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_sisdr[str(te_gender)] = np.average(
                    te_batch_sisdr, weights=te_batch_durations)

            te_sisdr['mean_gender'] = np.mean(
                [te_sisdr[str(x)] for x in C.gender_all])

        if args.latent_space in ('snr', 'all'):
            np.random.seed(0)
            torch.manual_seed(0)

            for te_snr in C.snr_all:

                logging.info(
                    f'Now testing model with {te_snr} dB mixture SNR inputs...'
                )

                te_batch_durations = list()
                te_batch_sisdr = list()
                files_speech = np.random.choice(te_utterances,
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=te_snr)
                    (X, X_mag) = F.stft(x)

                    X = X.permute(
                        1, 0,
                        2)[:stft_frames]  # (seq_len, num_features, channel)
                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X = torch.unsqueeze(X, dim=0)
                    X_mag = torch.unsqueeze(X_mag, dim=0)
                    s = torch.unsqueeze(s, dim=0)
                    x = torch.unsqueeze(x, dim=0)

                    actual_sisdr = float(F.calculate_sisdr(s, x).item())

                    # feed-forward
                    M_hat = network(X_mag)
                    s_hat = F.istft(X, mask=M_hat)

                    te_batch_sisdr.append(
                        (F.calculate_sisdr(s, s_hat,
                                           offset=actual_sisdr).mean().item()))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_sisdr[str(te_snr)] = np.average(te_batch_sisdr,
                                                   weights=te_batch_durations)

            te_sisdr['mean_snr'] = np.mean(
                [te_sisdr[str(x)] for x in C.snr_all])

        logging.info(json.dumps(te_sisdr, sort_keys=True, indent=4))
        F.write_data(filename=os.path.join(output_directory,
                                           'test_results.txt'),
                     data=te_sisdr)

    return
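A closing note on the aggregation these examples share: per-utterance scores are combined with np.average(values, weights=durations), so longer clips contribute proportionally more to the reported mean. Spelled out:

import numpy as np

scores = [10.0, 12.0]        # e.g. per-utterance SI-SDR improvements (dB)
durations = [16000, 48000]   # sample counts; the longer clip gets 3x weight
print(np.average(scores, weights=durations))  # 11.5
print(np.mean(scores))                        # 11.0, for comparison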