Code example #1
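These snippets are excerpts and omit their imports. A plausible shared preamble, with module aliases inferred from usage (the project-local module names below are assumptions, not confirmed by the listing):

import json
import logging
import os
import time
from math import ceil

import numpy as np
import torch

import config as C     # assumed name: constants such as C.snr_all, C.hop_size
import functions as F  # assumed name: helpers such as F.stft, F.write_data
import models as M     # assumed name: M.GatingNetwork, M.DenoisingNetwork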
def evaluation():
    with torch.no_grad():

        #
        # initialize network
        #
        np.random.seed(0)
        torch.manual_seed(0)

        network = EnsembleNetwork(
            filepath_gating=file_gating,
            filepaths_denoising=files_specialists,
            g_hs=hidden_size_gating,
            g_nl=num_layers_gating,
            s_hs=hidden_size_specialist,
            s_nl=num_layers_specialist,
            ct=args.latent_space,
        ).to(device=args.device_id)

        F.write_data(filename=os.path.join(output_directory,
                                           'files_gating.txt'),
                     data=str(file_gating))
        F.write_data(filename=os.path.join(output_directory,
                                           'files_specialist.txt'),
                     data=str(files_specialists))

        with torch.cuda.device(args.device_id):
            torch.cuda.empty_cache()

        if args.latent_space == 'gender':

            te_sisdr = {str(k): 0 for k in C.gender_all}
            for te_gender in C.gender_all:

                te_batch_durations = list()
                te_batch_sisdr = list()
                files_speech = np.random.choice(F.filter_by_gender(
                    te_utterances, te_gender),
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=C.snr_all)
                    (X, X_mag) = F.stft(x)

                    X = X.permute(
                        1, 0,
                        2)[:stft_frames]  # (seq_len, num_features, channel)
                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X = torch.unsqueeze(X, dim=0)
                    X_mag = torch.unsqueeze(X_mag, dim=0)
                    s = torch.unsqueeze(s, dim=0)
                    x = torch.unsqueeze(x, dim=0)

                    actual_sisdr = float(F.calculate_sisdr(s, x).item())

                    # feed-forward
                    M_hat = network(X_mag)
                    s_hat = F.istft(X, mask=M_hat)

                    te_batch_sisdr.append(
                        (F.calculate_sisdr(s, s_hat,
                                           offset=actual_sisdr).mean().item()))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_sisdr[str(te_gender)] = np.average(
                    te_batch_sisdr, weights=te_batch_durations)

        elif args.latent_space == 'snr':

            te_sisdr = {str(k): 0 for k in C.snr_all}
            for te_snr in C.snr_all:

                te_batch_durations = list()
                te_batch_sisdr = list()
                files_speech = np.random.choice(te_utterances,
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=te_snr)
                    (X, X_mag) = F.stft(x)

                    X = X.permute(
                        1, 0,
                        2)[:stft_frames]  # (seq_len, num_features, channel)
                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X = torch.unsqueeze(X, dim=0)
                    X_mag = torch.unsqueeze(X_mag, dim=0)
                    s = torch.unsqueeze(s, dim=0)
                    x = torch.unsqueeze(x, dim=0)

                    actual_sisdr = float(F.calculate_sisdr(s, x).item())

                    # feed-forward
                    M_hat = network(X_mag)
                    s_hat = F.istft(X, mask=M_hat)

                    te_batch_sisdr.append(
                        (F.calculate_sisdr(s, s_hat,
                                           offset=actual_sisdr).mean().item()))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_sisdr[str(te_snr)] = np.average(te_batch_sisdr,
                                                   weights=te_batch_durations)

        te_sisdr['mean'] = np.mean(list(te_sisdr.values()))

        logging.info(json.dumps(te_sisdr, sort_keys=True, indent=4))
        F.write_data(filename=os.path.join(output_directory,
                                           'test_results.txt'),
                     data=te_sisdr)

    return
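Every example in this listing scores separation quality with F.calculate_sisdr(s, s_hat, offset=actual_sisdr), whose implementation is not shown. A minimal sketch consistent with those calls, assuming the offset subtracts the mixture's own SI-SDR so the reported number reads as improvement in dB:

# Hypothetical sketch; the project's actual helper is not in this listing.
import torch

def calculate_sisdr(s, s_hat, offset=0.0, eps=1e-8):
    """Scale-invariant SDR in dB; `offset` lets callers report the
    result as improvement over the unprocessed mixture."""
    # zero-mean both signals along the time axis
    s = s - s.mean(dim=-1, keepdim=True)
    s_hat = s_hat - s_hat.mean(dim=-1, keepdim=True)
    # project the estimate onto the target (optimal scaling factor)
    alpha = (s_hat * s).sum(dim=-1, keepdim=True) / (
        (s * s).sum(dim=-1, keepdim=True) + eps)
    target = alpha * s
    residual = s_hat - target
    # energy ratio in decibels
    sisdr = 10 * torch.log10((target.pow(2).sum(dim=-1) + eps) /
                             (residual.pow(2).sum(dim=-1) + eps))
    return sisdr - offset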
Code example #2
File: train_gating.py Project: mtxing/sparse_mle
def experiment():

    #
    # initialize network
    #
    np.random.seed(0)
    torch.manual_seed(0)

    network = M.GatingNetwork(
        args.hidden_size,
        args.num_layers,
        C.num_clusters[args.latent_space],
    ).to(device=args.device_id)

    network_params = F.count_parameters(network)

    optimizer = torch.optim.Adam(
        params=network.parameters(),
        lr=args.learning_rate,
    )

    criterion = torch.nn.BCELoss()

    F.write_data(filename=os.path.join(output_directory, 'num_parameters.txt'),
                 data=network_params)

    with torch.cuda.device(args.device_id):
        torch.cuda.empty_cache()


    #
    # log experiment configuration
    #
    os.system('cls' if os.name == 'nt' else 'clear')
    logging.info(f'Training Gating network for {args.latent_space}-based clustering...')
    logging.info(f'\u2022 {args.hidden_size} hidden units')
    logging.info(f'\u2022 {args.num_layers} layers')
    logging.info(f'\u2022 {network_params} learnable parameters')
    logging.info(f'\u2022 {args.learning_rate:.3e} learning rate')
    logging.info(f'Results will be saved in "{output_directory}".')
    logging.info(f'Using GPU device {args.device_id}...')


    #
    # experiment loop
    #
    (iteration, iteration_best) = (0, 0)
    accuracy_best = 0

    while not C.stopping_criteria(iteration, iteration_best):

        network.train()
        np.random.seed(iteration)
        torch.manual_seed(iteration)

        # training
        for batch_index in range(100):

            # forward propagation
            batch = F.generate_batch(
                np.random.choice(tr_utterances, size=C.tr_batch_size),
                np.random.choice(tr_noises, size=C.tr_batch_size),
                device=args.device_id,
            )
            Y_hat = network(batch.X_mag)

            # prepare targets
            if args.latent_space == 'gender':
                Y = batch.index_gender
            elif args.latent_space == 'snr':
                Y = batch.index_sdr

            # backward propagation
            optimizer.zero_grad()
            criterion(Y_hat, Y).backward()
            torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm=1e-4)
            optimizer.step()

        network.eval()
        np.random.seed(0)
        torch.manual_seed(0)

        # validation
        with torch.no_grad():

            if args.latent_space == 'gender':

                accuracy = {k: 0 for k in C.gender_all}
                for vl_gender in C.gender_all:

                    vl_filtered_files = F.filter_by_gender(
                        vl_utterances,
                        vl_gender
                    )
                    batch = F.generate_batch(
                        np.random.choice(vl_filtered_files, size=C.vl_batch_size),
                        np.random.choice(vl_noises, size=C.vl_batch_size),
                        device=args.device_id,
                    )
                    Y_hat = network(batch.X_mag)
                    Y = batch.index_gender
                    accuracy[vl_gender] = F.calculate_accuracy(Y, Y_hat)

            elif args.latent_space == 'snr':

                accuracy = {k: 0 for k in C.snr_all}
                for vl_snr in C.snr_all:

                    batch = F.generate_batch(
                        np.random.choice(vl_utterances, size=C.vl_batch_size),
                        np.random.choice(vl_noises, size=C.vl_batch_size),
                        mixture_snr=vl_snr,
                        device=args.device_id,
                    )
                    Y_hat = network(batch.X_mag)
                    Y = batch.index_sdr
                    accuracy[vl_snr] = F.calculate_accuracy(Y, Y_hat)

        accuracy['mean'] = np.mean(list(accuracy.values()))

        # print results
        if accuracy['mean'] > accuracy_best:
            accuracy_best = accuracy['mean']
            iteration_best = iteration

            F.write_data(
                filename=os.path.join(output_directory, 'validation_accuracy.txt'),
                data=f'{accuracy_best:%}'
            )
            torch.save(network.state_dict(), os.path.join(output_directory, 'model.pt'))
            checkmark = ' | \033[32m\u2714\033[39m'
        else:
            checkmark = ''

        status = ''
        for (k, v) in accuracy.items():
            status += f'\033[33m{k}: {v:>8.3%}\033[39m, '
        ts_end = int(round(time.time())) - ts_start
        status += f'Time Elapsed: {int(ts_end/60)} minutes' + checkmark
        logging.info(status)
        iteration += 1

    return
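Both training scripts loop `while not C.stopping_criteria(iteration, iteration_best)`. The criterion itself lives in the config module and is not part of this listing; a plausible patience-based sketch (the constants are assumptions):

PATIENCE = 10          # assumed: validation rounds without improvement
MAX_ITERATIONS = 1000  # assumed: hard cap on total rounds

def stopping_criteria(iteration, iteration_best):
    # stop once the best result is PATIENCE rounds old, or at the cap
    return (iteration - iteration_best > PATIENCE
            or iteration >= MAX_ITERATIONS)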
Code example #3
File: train_denoising.py Project: mtxing/sparse_mle
def experiment():

    #
    # initialize network
    #
    np.random.seed(0)
    torch.manual_seed(0)

    network = M.DenoisingNetwork(args.hidden_size,
                                 args.num_layers).to(device=args.device_id)

    network_params = F.count_parameters(network)

    optimizer = torch.optim.Adam(
        params=network.parameters(),
        lr=args.learning_rate,
    )

    criterion = F.loss_sisdr

    F.write_data(filename=os.path.join(output_directory, 'num_parameters.txt'),
                 data=network_params)

    with torch.cuda.device(args.device_id):
        torch.cuda.empty_cache()

    #
    # log experiment configuration
    #
    os.system('cls' if os.name == 'nt' else 'clear')
    logging.info('Training Denoising network' + (
        f' specializing in {F.fmt_specialty(args.specialization)} mixtures'
        if args.specialization else '') + '...')
    logging.info(f'\u2022 {args.hidden_size} hidden units')
    logging.info(f'\u2022 {args.num_layers} layers')
    logging.info(f'\u2022 {network_params} learnable parameters')
    logging.info(f'\u2022 {args.learning_rate:.3e} learning rate')
    logging.info(f'Results will be saved in "{output_directory}".')
    logging.info(f'Using GPU device {args.device_id}...')

    #
    # experiment loop
    #
    (iteration, iteration_best) = (0, 0)
    sisdr_best = 0

    while not C.stopping_criteria(iteration, iteration_best):

        network.train()
        np.random.seed(iteration)
        torch.manual_seed(iteration)

        # training
        for batch_index in range(100):

            # forward propagation
            batch = F.generate_batch(
                np.random.choice(tr_utterances, size=C.tr_batch_size),
                np.random.choice(tr_noises, size=C.tr_batch_size),
                mixture_snr=tr_snr,
                device=args.device_id,
            )
            M_hat = network(batch.X_mag)
            s_hat = F.istft(batch.X, mask=M_hat)

            # backward propagation
            optimizer.zero_grad()
            F.loss_sisdr(batch.s, s_hat).backward()
            torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm=1e-4)
            optimizer.step()

        network.eval()
        np.random.seed(0)
        torch.manual_seed(0)

        # validation
        with torch.no_grad():

            if args.latent_space == 'gender':

                sisdr_batch = {k: 0 for k in C.gender_all}
                for vl_gender in C.gender_all:

                    vl_filtered_files = F.filter_by_gender(
                        vl_utterances, vl_gender)
                    batch = F.generate_batch(
                        np.random.choice(vl_filtered_files,
                                         size=C.vl_batch_size),
                        np.random.choice(vl_noises, size=C.vl_batch_size),
                        device=args.device_id,
                    )
                    M_hat = network(batch.X_mag)
                    s_hat = F.istft(batch.X, mask=M_hat)
                    sisdr_batch[vl_gender] = float(
                        F.calculate_sisdr(
                            batch.s, s_hat,
                            offset=batch.actual_sisdr).mean().item())

            else:

                sisdr_batch = {k: 0 for k in C.snr_all}
                for vl_snr in C.snr_all:

                    batch = F.generate_batch(
                        np.random.choice(vl_utterances, size=C.vl_batch_size),
                        np.random.choice(vl_noises, size=C.vl_batch_size),
                        mixture_snr=vl_snr,
                        device=args.device_id,
                    )
                    M_hat = network(batch.X_mag)
                    s_hat = F.istft(batch.X, mask=M_hat)
                    sisdr_batch[vl_snr] = float(
                        F.calculate_sisdr(
                            batch.s, s_hat,
                            offset=batch.actual_sisdr).mean().item())

        sisdr_batch['mean'] = np.mean(list(sisdr_batch.values()))

        # print results
        key = 'mean' if args.latent_space != 'snr' else tr_snr
        if sisdr_batch[key] > sisdr_best:
            sisdr_best = sisdr_batch[key]
            iteration_best = iteration

            # SI-SDR is measured in dB, so avoid the percent format here
            F.write_data(filename=os.path.join(output_directory,
                                               'validation_sisdr.txt'),
                         data=f'{sisdr_best:.3f}')
            torch.save(network.state_dict(),
                       os.path.join(output_directory, 'model.pt'))
            checkmark = ' | \033[32m\u2714\033[39m'
        else:
            checkmark = ''

        status = ''
        for (k, v) in sisdr_batch.items():
            status += f'\033[33m{k}: {v:>6.3f} dB\033[39m, '
        ts_end = int(round(time.time())) - ts_start
        status += f'Time Elapsed: {int(ts_end/60)} minutes' + checkmark
        logging.info(status)
        iteration += 1

    return
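F.loss_sisdr, used for backpropagation above, is also external to this listing. Since SI-SDR is a higher-is-better score, a standard choice (assumed here) is its negated batch mean, reusing the calculate_sisdr sketch given after code example #1:

def loss_sisdr(s, s_hat):
    # negate so that maximizing SI-SDR becomes minimizing the loss
    return -calculate_sisdr(s, s_hat).mean()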
Code example #4
File: train_denoising.py Project: mtxing/sparse_mle
#
# load dataset file paths
#
filepaths = np.load('filepaths.npy', allow_pickle=True)
(tr_utterances, tr_noises) = filepaths[0:2]
(vl_utterances, vl_noises) = filepaths[2:4]

(tr_snr, tr_gender) = (C.snr_all, C.gender_all)
args.latent_space = None
if args.specialization is not None:
    if args.specialization.isnumeric():
        assert int(args.specialization) in set(C.snr_all)
        args.latent_space = 'snr'
        tr_snr = int(args.specialization)
    elif args.specialization.isalpha():
        assert str(args.specialization) in set(C.gender_all)
        args.latent_space = 'gender'
        tr_utterances = F.filter_by_gender(tr_utterances,
                                           str(args.specialization))

#
# experiment
#


def experiment():

    #
    # initialize network
    #
    np.random.seed(0)
    torch.manual_seed(0)

    network = M.DenoisingNetwork(args.hidden_size,
                                 args.num_layers).to(device=args.device_id)
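The dispatch at the top of this file routes the --specialization argument: a numeric string selects an SNR specialist, an alphabetic string a gender specialist, and no argument a generalist. One caveat: str.isnumeric() is False for signed strings such as '-5', so the parse implicitly assumes the labels in C.snr_all are non-negative integers. A hypothetical illustration of the convention:

for spec in ('5', 'M', None):  # hypothetical argument values
    if spec is None:
        print('generalist: trained across all SNRs and genders')
    elif spec.isnumeric():
        print(f'SNR specialist: mixtures fixed at {int(spec)} dB')
    elif spec.isalpha():
        print(f'gender specialist: utterances filtered to "{spec}"')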
Code example #5
def evaluation():
    with torch.no_grad():

        #
        # initialize network
        #
        np.random.seed(0)
        torch.manual_seed(0)

        network = GatingNetwork(
            args.hidden_size,
            args.num_layers,
            args.num_clusters,
        ).to(device=args.device_id)

        network_params = F.count_parameters(network)

        network.load_state_dict(
            torch.load(args.state_dict_file,
                       map_location=torch.device(args.device_id)),
            strict=False)
        network.eval()

        F.write_data(filename=os.path.join(output_directory,
                                           'num_parameters.txt'),
                     data=network_params)

        with torch.cuda.device(args.device_id):
            torch.cuda.empty_cache()

        if args.latent_space == 'gender':

            te_accuracy = {str(k): 0 for k in C.gender_all}
            for te_gender in C.gender_all:

                te_batch_durations = list()
                te_batch_accuracy = list()
                files_speech = np.random.choice(F.filter_by_gender(
                    te_utterances, te_gender),
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=C.snr_all)
                    (X, X_mag) = F.stft(x)

                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X_mag = torch.unsqueeze(X_mag, dim=0)

                    Y = torch.zeros(1,
                                    len(C.gender_all),
                                    device=args.device_id)
                    gender_index = int(te_gender == 'F')
                    Y[..., gender_index] = 1

                    # forward pass
                    Y_hat = network(X_mag)

                    te_batch_accuracy.append(F.calculate_accuracy(Y, Y_hat))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_accuracy[str(te_gender)] = np.average(
                    te_batch_accuracy, weights=te_batch_durations)

        elif args.latent_space == 'snr':

            te_accuracy = {str(k): 0 for k in C.snr_all}
            for te_snr in C.snr_all:

                te_batch_durations = list()
                te_batch_accuracy = list()
                files_speech = np.random.choice(te_utterances,
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=te_snr)
                    (X, X_mag) = F.stft(x)

                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X_mag = torch.unsqueeze(X_mag, dim=0)

                    actual_sdr = float(F.calculate_sdr(s, x).item())
                    sdr_index = int(np.abs(C.snr_all - actual_sdr).argmin())
                    Y = torch.zeros(1, len(C.snr_all), device=args.device_id)
                    Y[..., sdr_index] = 1

                    # forward pass
                    Y_hat = network(X_mag)

                    te_batch_accuracy.append(F.calculate_accuracy(Y, Y_hat))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_accuracy[str(te_snr)] = np.average(
                    te_batch_accuracy, weights=te_batch_durations)

        te_accuracy['mean'] = np.mean(list(te_accuracy.values()))

        logging.info(json.dumps(te_accuracy, sort_keys=True, indent=4))
        F.write_data(filename=os.path.join(output_directory,
                                           'test_results.txt'),
                     data=te_accuracy)

    return
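F.calculate_accuracy is called with a one-hot target Y and the gating network's per-cluster output Y_hat. A minimal sketch matching that usage (an assumption, not the project's code):

import torch

def calculate_accuracy(Y, Y_hat):
    # fraction of examples whose argmax prediction matches the
    # argmax of the one-hot target
    pred = Y_hat.argmax(dim=-1)
    true = Y.argmax(dim=-1)
    return (pred == true).float().mean().item()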
Code example #6
File: test_denoising.py Project: mtxing/sparse_mle
def evaluation():
    with torch.no_grad():

        #
        # initialize network
        #
        np.random.seed(0)
        torch.manual_seed(0)

        network = DenoisingNetwork(args.hidden_size,
                                   args.num_layers).to(device=args.device_id)

        network_params = F.count_parameters(network)

        network.load_state_dict(
            torch.load(args.state_dict_file,
                       map_location=torch.device(args.device_id)),
            strict=True)
        network.eval()

        F.write_data(filename=os.path.join(output_directory,
                                           'num_parameters.txt'),
                     data=network_params)

        with torch.cuda.device(args.device_id):
            torch.cuda.empty_cache()

        te_sisdr = dict()

        if args.latent_space in ('gender', 'all'):
            np.random.seed(0)
            torch.manual_seed(0)

            for te_gender in C.gender_all:

                logging.info(
                    f'Now testing model with {te_gender}-gender inputs...')

                te_batch_durations = list()
                te_batch_sisdr = list()
                files_speech = np.random.choice(F.filter_by_gender(
                    te_utterances, te_gender),
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=C.snr_all)
                    (X, X_mag) = F.stft(x)

                    X = X.permute(
                        1, 0,
                        2)[:stft_frames]  # (seq_len, num_features, channel)
                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X = torch.unsqueeze(X, dim=0)
                    X_mag = torch.unsqueeze(X_mag, dim=0)
                    s = torch.unsqueeze(s, dim=0)
                    x = torch.unsqueeze(x, dim=0)

                    actual_sisdr = float(F.calculate_sisdr(s, x).item())

                    # feed-forward
                    M_hat = network(X_mag)
                    s_hat = F.istft(X, mask=M_hat)

                    te_batch_sisdr.append(
                        (F.calculate_sisdr(s, s_hat,
                                           offset=actual_sisdr).mean().item()))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_sisdr[str(te_gender)] = np.average(
                    te_batch_sisdr, weights=te_batch_durations)

            te_sisdr['mean_gender'] = np.mean(
                [te_sisdr[str(x)] for x in C.gender_all])

        if args.latent_space in ('snr', 'all'):
            np.random.seed(0)
            torch.manual_seed(0)

            for te_snr in C.snr_all:

                logging.info(
                    f'Now testing model with {te_snr} dB mixture SDR inputs...'
                )

                te_batch_durations = list()
                te_batch_sisdr = list()
                files_speech = np.random.choice(te_utterances,
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)

                for (i, fs, fn) in zip(range(C.te_batch_size), files_speech,
                                       files_noise):

                    source = F.load_audio(fs,
                                          duration=None,
                                          random_offset=False,
                                          device=args.device_id)
                    noise = F.load_audio(fn,
                                         duration=None,
                                         random_offset=False,
                                         device=args.device_id)
                    min_length = min(len(source), len(noise))
                    stft_frames = ceil(min_length / C.hop_size)
                    source = source[:min_length]
                    noise = noise[:min_length]

                    (x, s, n) = F.mix_signals(source, noise, snr_db=te_snr)
                    (X, X_mag) = F.stft(x)

                    X = X.permute(
                        1, 0,
                        2)[:stft_frames]  # (seq_len, num_features, channel)
                    X_mag = X_mag.permute(
                        1, 0)[:stft_frames]  # (seq_len, num_features)
                    X = torch.unsqueeze(X, dim=0)
                    X_mag = torch.unsqueeze(X_mag, dim=0)
                    s = torch.unsqueeze(s, dim=0)
                    x = torch.unsqueeze(x, dim=0)

                    actual_sisdr = float(F.calculate_sisdr(s, x).item())

                    # feed-forward
                    M_hat = network(X_mag)
                    s_hat = F.istft(X, mask=M_hat)

                    te_batch_sisdr.append(
                        (F.calculate_sisdr(s, s_hat,
                                           offset=actual_sisdr).mean().item()))
                    te_batch_durations.append(min_length)

                # store the weighted average results
                te_sisdr[str(te_snr)] = np.average(te_batch_sisdr,
                                                   weights=te_batch_durations)

            te_sisdr['mean_sisdr'] = np.mean(
                [te_sisdr[str(x)] for x in C.snr_all])

        logging.info(json.dumps(te_sisdr, sort_keys=True, indent=4))
        F.write_data(filename=os.path.join(output_directory,
                                           'test_results.txt'),
                     data=te_sisdr)

    return
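The shape comments in these scripts imply that F.stft returns the complex spectrogram as real/imaginary pairs of shape (num_features, seq_len, 2) alongside its magnitude, and that F.istft applies a real-valued mask before inversion. A sketch under those assumptions (the frame size, hop, and window are placeholders, and the batch/permute handling done by the callers is omitted):

import torch

N_FFT = 1024  # assumed frame size
HOP = 256     # assumed hop length (C.hop_size in the snippets)

def stft(x):
    window = torch.hann_window(N_FFT, device=x.device)
    X = torch.stft(x, n_fft=N_FFT, hop_length=HOP, window=window,
                   return_complex=True)
    # real/imag pairs: (num_features, seq_len, 2), plus the magnitude
    return torch.view_as_real(X), X.abs()

def istft(X_ri, mask=None):
    X = torch.view_as_complex(X_ri.contiguous())
    if mask is not None:
        X = X * mask  # mask shape assumed broadcast-compatible
    window = torch.hann_window(N_FFT, device=X_ri.device)
    return torch.istft(X, n_fft=N_FFT, hop_length=HOP, window=window)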
Code example #7
def experiment():

    #
    # ensure reproducibility
    #
    np.random.seed(0)
    torch.manual_seed(0)

    #
    # initialize network
    #
    network = EnsembleNetwork(
        filepath_gating=file_gating,
        filepaths_denoising=files_specialists,
        g_hs=hidden_size_gating,
        g_nl=num_layers_gating,
        s_hs=hidden_size_specialist,
        s_nl=num_layers_specialist,
        ct=args.latent_space,
    ).to(device=args.device_id)

    optimizer = torch.optim.Adam(
        params=network.parameters(),
        lr=args.learning_rate,
    )

    network_params = F.count_parameters(network.gating) + F.count_parameters(
        network.specialists[0])

    F.write_data(filename=os.path.join(output_directory, 'num_parameters.txt'),
                 data=network_params)
    F.write_data(filename=os.path.join(output_directory, 'files_gating.txt'),
                 data=file_gating)
    F.write_data(filename=os.path.join(output_directory,
                                       'files_specialist.txt'),
                 data=files_specialists)
    with torch.cuda.device(args.device_id):
        torch.cuda.empty_cache()

    #
    # log experiment configuration
    #
    os.system('cls' if os.name == 'nt' else 'clear')
    logging.info(
        f'Training Ensemble network composed of {args.latent_space} specialists...'
    )
    logging.info(f'\u2022 {architecture_gating} gating architecture')
    logging.info(f'\u2022 {architecture_specialist} specialist architecture')
    logging.info(
        f'\u2022 Softmax annealing strategy = {args.softmax_annealing if args.softmax_annealing else None}'
    )
    logging.info(f'\u2022 {network_params} learnable parameters')
    logging.info(f'\u2022 {args.learning_rate:.3e} learning rate')
    logging.info(f'Results will be saved in "{output_directory}".')
    logging.info(f'Using GPU device {args.device_id}...')

    #
    # experiment loop
    #
    (iteration, iteration_best) = (0, 0)
    sisdr_best = 0

    while not C.stopping_criteria(iteration, iteration_best):

        network.train()
        np.random.seed(iteration)
        torch.manual_seed(iteration)

        # training
        for batch_index in range(100):

            # forward propagation
            batch = F.generate_batch(
                np.random.choice(tr_utterances, size=C.tr_batch_size),
                np.random.choice(tr_noises, size=C.tr_batch_size),
                device=args.device_id,
            )
            M_hat = network(batch.X_mag, args.softmax_annealing)
            s_hat = F.istft(batch.X, mask=M_hat)

            # backward propagation
            optimizer.zero_grad()
            F.loss_sisdr(batch.s, s_hat).backward()
            torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm=1e-4)
            optimizer.step()

        network.eval()
        np.random.seed(0)
        torch.manual_seed(0)

        # validation
        with torch.no_grad():

            if args.latent_space == 'gender':

                sisdr_batch = {k: 0 for k in C.gender_all}
                for vl_gender in C.gender_all:

                    vl_filtered_files = F.filter_by_gender(
                        vl_utterances, vl_gender)
                    batch = F.generate_batch(
                        np.random.choice(vl_filtered_files,
                                         size=C.vl_batch_size),
                        np.random.choice(vl_noises, size=C.vl_batch_size),
                        device=args.device_id,
                    )
                    M_hat = network(batch.X_mag)
                    s_hat = F.istft(batch.X, mask=M_hat)
                    sisdr_batch[vl_gender] = float(
                        F.calculate_sisdr(
                            batch.s, s_hat,
                            offset=batch.actual_sisdr).mean().item())

            else:

                sisdr_batch = {k: 0 for k in C.snr_all}
                for vl_snr in C.snr_all:

                    batch = F.generate_batch(
                        np.random.choice(vl_utterances, size=C.vl_batch_size),
                        np.random.choice(vl_noises, size=C.vl_batch_size),
                        mixture_snr=vl_snr,
                        device=args.device_id,
                    )
                    M_hat = network(batch.X_mag)
                    s_hat = F.istft(batch.X, mask=M_hat)
                    sisdr_batch[vl_snr] = float(
                        F.calculate_sisdr(
                            batch.s, s_hat,
                            offset=batch.actual_sisdr).mean().item())

        sisdr_batch['mean'] = np.mean(list(sisdr_batch.values()))

        # print results
        if sisdr_batch['mean'] > sisdr_best:
            sisdr_best = sisdr_batch['mean']
            iteration_best = iteration

            # SI-SDR is measured in dB, so avoid the percent format here
            F.write_data(filename=os.path.join(output_directory,
                                               'validation_sisdr.txt'),
                         data=f'{sisdr_best:.3f}')
            torch.save(network.state_dict(),
                       os.path.join(output_directory, 'model.pt'))
            checkmark = ' | \033[32m\u2714\033[39m'
        else:
            checkmark = ''

        status = ''
        for (k, v) in sisdr_batch.items():
            status += f'\033[33m{k}: {v:>6.3f} dB\033[39m, '
        ts_end = int(round(time.time())) - ts_start
        status += f'Time Elapsed: {int(ts_end/60)} minutes' + checkmark
        logging.info(status)
        logging.info(
            f'Network # of forwards: {network.num_forwards} \t Alpha: {network.alpha}'
        )
        iteration += 1

    return os.path.join(output_directory, 'model.pt')
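Code example #7 passes args.softmax_annealing into the forward call and logs network.alpha next to network.num_forwards, which suggests the gating softmax is sharpened as training proceeds. A hypothetical sketch of such a temperature schedule (both the rate and the functional form are assumptions):

import torch

def annealed_gate(logits, num_forwards, rate=1e-4):
    # alpha grows with the number of forward passes, hardening the
    # softmax from a smooth mixture toward one-hot expert selection
    alpha = 1.0 + rate * num_forwards
    return torch.softmax(alpha * logits, dim=-1)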