Beispiel #1
0
def use_me_process(sources_list, output_file_names):
    """Separate the voice and the background music of the given files.

    Builds the masker (RNN encoder, RNN decoder, FNN masker) and the
    denoiser, loads their stored weights, runs every source file
    through that chain and hands the prediction, together with the
    matching output file names, to ``data_process_results_testing``.

    :param sources_list: The file names to be used.
    :type sources_list: list[str]
    :param output_file_names: The output file names to be used; entry\
                              ``i`` belongs to ``sources_list[i]``.
    :type output_file_names: list[list[str]]
    :raises SystemExit: If the ``debug`` setting is enabled.
    """

    print('\n-- Welcome to MaD TwinNet.')
    if debug:
        print(
            '\n-- Cannot proceed in debug mode. Please set debug=False at the settings file.'
        )
        print('-- Exiting.')
        # Same effect as `exit(-1)` without relying on the `site` helper.
        raise SystemExit(-1)
    print(
        '-- Now I will extract the voice and the background music from the provided files'
    )

    # `torch.has_cudnn` only tells how torch was built; whether a GPU can
    # actually be used is answered by `torch.cuda.is_available()`.
    use_cuda = not debug and torch.cuda.is_available()

    # Masker modules
    rnn_enc = RNNEnc(hyper_parameters['reduced_dim'],
                     hyper_parameters['context_length'], debug)
    rnn_dec = RNNDec(hyper_parameters['rnn_enc_output_dim'], debug)
    fnn = FNNMasker(hyper_parameters['rnn_enc_output_dim'],
                    hyper_parameters['original_input_dim'],
                    hyper_parameters['context_length'])

    # Denoiser modules
    denoiser = FNNDenoiser(hyper_parameters['original_input_dim'])

    modules = (
        ('rnn_enc', rnn_enc),
        ('rnn_dec', rnn_dec),
        ('fnn', fnn),
        ('denoiser', denoiser),
    )

    for state_key, module in modules:
        # Deserialize onto the CPU so that weights stored from a GPU run
        # can still be loaded on a machine without CUDA.
        module.load_state_dict(
            torch.load(output_states_path[state_key],
                       map_location=lambda storage, location: storage))
        # Inference only: switch off training-time behaviour.
        module.eval()
        if use_cuda:
            module.cuda()

    testing_it = data_feeder_testing(
        window_size=hyper_parameters['window_size'],
        fft_size=hyper_parameters['fft_size'],
        hop_size=hyper_parameters['hop_size'],
        seq_length=hyper_parameters['seq_length'],
        context_length=hyper_parameters['context_length'],
        batch_size=1,
        debug=debug,
        sources_list=sources_list)

    print('-- Let\'s go!\n')
    total_time = 0

    batch_size = training_constants['batch_size']

    for index, data in enumerate(testing_it()):

        s_time = time.time()

        mix, mix_magnitude, mix_phase, voice_true, bg_true = data

        voice_predicted = np.zeros(
            (mix_magnitude.shape[0], hyper_parameters['seq_length'] -
             hyper_parameters['context_length'] * 2,
             hyper_parameters['window_size']),
            dtype=np.float32)

        # NOTE(review): only full batches are processed; rows of a trailing
        # partial batch stay zero. Confirm the feeder yields a multiple of
        # the batch size.
        for batch in range(int(mix_magnitude.shape[0] / batch_size)):
            b_start = batch * batch_size
            b_end = (batch + 1) * batch_size

            v_in = Variable(
                torch.from_numpy(mix_magnitude[b_start:b_end, :, :]))

            if use_cuda:
                v_in = v_in.cuda()

            # Masker chain followed by the denoiser.
            tmp_voice_predicted = rnn_enc(v_in)
            tmp_voice_predicted = rnn_dec(tmp_voice_predicted)
            tmp_voice_predicted = fnn(tmp_voice_predicted, v_in)
            tmp_voice_predicted = denoiser(tmp_voice_predicted)

            voice_predicted[
                b_start:b_end, :, :] = tmp_voice_predicted.data.cpu().numpy()

        data_process_results_testing(
            index=index,
            voice_true=voice_true,
            bg_true=bg_true,
            voice_predicted=voice_predicted,
            window_size=hyper_parameters['window_size'],
            mix=mix,
            mix_magnitude=mix_magnitude,
            mix_phase=mix_phase,
            hop=hyper_parameters['hop_size'],
            context_length=hyper_parameters['context_length'],
            output_file_name=output_file_names[index])

        e_time = time.time()

        print(
            usage_output_string_per_example.format(f=sources_list[index],
                                                   t=e_time - s_time))

        total_time += e_time - s_time

    print('\n-- Testing finished\n')
    print(usage_output_string_total.format(t=total_time))
    print('-- That\'s all folks!')
Beispiel #2
0
def _testing_process(data, index, mad, device, seq_length,
                     context_length, window_size, batch_size,
                     hop_size, outputPath):
    """Run the MaD system over one testing example and score it.

    The mixture magnitude is fed to ``mad`` in full batches, the
    filtered voice estimate (``v_j_filt``) is collected and passed
    to ``data_feeder.data_process_results_testing``. The median of
    the non-NaN scores is printed for the example.

    :param data: The testing data; unpacked as (mix, mix magnitude,\
                 mix phase, true voice, true background).
    :type data: numpy.ndarray
    :param index: The index of the testing data (used for\
                  calculating scores).
    :type index: int
    :param mad: The MaD system.
    :type mad: torch.nn.Module
    :param device: The device to be used.
    :type device: str
    :param seq_length: The sequence length used.
    :type seq_length: int
    :param context_length: The context length used.
    :type context_length: int
    :param window_size: The window size used.
    :type window_size: int
    :param batch_size: The batch size used.
    :type batch_size: int
    :param hop_size: The hop size used.
    :type hop_size: int
    :param outputPath: Forwarded unchanged to\
                       ``data_process_results_testing``.
    :return: The SDR, SIR and SAR scores, and the time elapsed for\
             the process.
    :rtype: (numpy.ndarray, numpy.ndarray, numpy.ndarray, float)\
            -- score types presumably arrays, confirm against the\
            data feeder.
    """
    def _median_without_nan(values):
        # Median over the entries that are not NaN.
        return np.median([v for v in values if not np.isnan(v)])

    started_at = time.time()

    mix, mix_magnitude, mix_phase, voice_true, bg_true = data

    n_items = mix_magnitude.shape[0]
    voice_predicted = np.zeros(
        (n_items, seq_length - context_length * 2, window_size),
        dtype=np.float32)

    # Only full batches are visited; rows beyond the last full batch
    # keep their initial zeros.
    for batch in range(int(n_items / batch_size)):
        rows = slice(batch * batch_size, (batch + 1) * batch_size)

        v_in = from_numpy(mix_magnitude[rows, :, :]).to(device)

        mad_output = mad(v_in.unsqueeze(1))
        voice_predicted[rows, :, :] = mad_output.v_j_filt.cpu().numpy()

    tmp_sdr, tmp_sir, tmp_sar = data_feeder.data_process_results_testing(
        index=index,
        voice_true=voice_true,
        bg_true=bg_true,
        voice_predicted=voice_predicted,
        window_size=window_size,
        mix=mix,
        mix_magnitude=mix_magnitude,
        mix_phase=mix_phase,
        hop=hop_size,
        context_length=context_length,
        outputPath=outputPath)

    time_elapsed = time.time() - started_at

    report = testing_output_string_per_example.format(
        e=index,
        sdr=_median_without_nan(tmp_sdr[0]),
        sir=_median_without_nan(tmp_sir[0]),
        sar=_median_without_nan(tmp_sar[0]),
        t=time_elapsed)
    printing.print_msg(report)

    return tmp_sdr, tmp_sir, tmp_sar, time_elapsed
Beispiel #3
0
def testing_process():
    """Evaluate the stored model on the testing data and save the scores.

    Builds the masker (RNN encoder, RNN decoder, FNN masker) and the
    denoiser, loads their stored weights, predicts the voice for every
    testing example, prints the median SDR/SIR per example and over
    all examples, and pickles the collected SDR and SIR lists to the
    paths given by ``metrics_paths``.
    """

    device = 'cuda' if not debug and torch.cuda.is_available() else 'cpu'

    print('\n-- Starting testing process. Debug mode: {}'.format(debug))
    print('-- Process on: {}'.format(device), end='\n\n')
    print('-- Setting up modules... ', end='')

    # Masker modules
    rnn_enc = RNNEnc(hyper_parameters['reduced_dim'],
                     hyper_parameters['context_length'], debug)
    rnn_dec = RNNDec(hyper_parameters['rnn_enc_output_dim'], debug)
    fnn = FNNMasker(hyper_parameters['rnn_enc_output_dim'],
                    hyper_parameters['original_input_dim'],
                    hyper_parameters['context_length'])

    # Denoiser modules
    denoiser = FNNDenoiser(hyper_parameters['original_input_dim'])

    modules = (
        ('rnn_enc', rnn_enc),
        ('rnn_dec', rnn_dec),
        ('fnn', fnn),
        ('denoiser', denoiser),
    )

    for state_key, module in modules:
        # `load_state_dict` does not return the module, so `.to(device)`
        # must be called on the module itself, not chained on the load.
        # Weights are deserialized on the CPU first so that a checkpoint
        # saved from a GPU run also loads on a CPU-only machine.
        module.load_state_dict(
            torch.load(output_states_path[state_key], map_location='cpu'))
        module.to(device)
        # Inference only: switch off training-time behaviour.
        module.eval()

    print('done.')

    testing_it = data_feeder_testing(
        window_size=hyper_parameters['window_size'],
        fft_size=hyper_parameters['fft_size'],
        hop_size=hyper_parameters['hop_size'],
        seq_length=hyper_parameters['seq_length'],
        context_length=hyper_parameters['context_length'],
        batch_size=1,
        debug=debug)

    print('-- Testing starts\n')

    sdr = []
    sir = []
    total_time = 0

    batch_size = training_constants['batch_size']

    for index, data in enumerate(testing_it()):

        s_time = time.time()

        mix, mix_magnitude, mix_phase, voice_true, bg_true = data

        voice_predicted = np.zeros(
            (mix_magnitude.shape[0], hyper_parameters['seq_length'] -
             hyper_parameters['context_length'] * 2,
             hyper_parameters['window_size']),
            dtype=np.float32)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            # NOTE(review): only full batches are processed; rows of a
            # trailing partial batch stay zero. Confirm the feeder yields
            # a multiple of the batch size.
            for batch in range(int(mix_magnitude.shape[0] / batch_size)):
                b_start = batch * batch_size
                b_end = (batch + 1) * batch_size

                v_in = torch.from_numpy(
                    mix_magnitude[b_start:b_end, :, :]).to(device)

                # Masker chain followed by the denoiser.
                tmp_voice_predicted = rnn_enc(v_in)
                tmp_voice_predicted = rnn_dec(tmp_voice_predicted)
                tmp_voice_predicted = fnn(tmp_voice_predicted, v_in)
                tmp_voice_predicted = denoiser(tmp_voice_predicted)

                voice_predicted[
                    b_start:b_end, :, :] = tmp_voice_predicted.cpu().numpy()

        tmp_sdr, tmp_sir = data_process_results_testing(
            index=index,
            voice_true=voice_true,
            bg_true=bg_true,
            voice_predicted=voice_predicted,
            window_size=hyper_parameters['window_size'],
            mix=mix,
            mix_magnitude=mix_magnitude,
            mix_phase=mix_phase,
            hop=hyper_parameters['hop_size'],
            context_length=hyper_parameters['context_length'])

        e_time = time.time()

        print(
            testing_output_string_per_example.format(
                e=index,
                sdr=np.median([i for i in tmp_sdr[0] if not np.isnan(i)]),
                sir=np.median([i for i in tmp_sir[0] if not np.isnan(i)]),
                t=e_time - s_time))

        total_time += e_time - s_time

        sdr.append(tmp_sdr)
        sir.append(tmp_sir)

    print('\n-- Testing finished\n')
    print(
        testing_output_string_all.format(
            sdr=np.median([ii for i in sdr for ii in i[0]
                           if not np.isnan(ii)]),
            sir=np.median([ii for i in sir for ii in i[0]
                           if not np.isnan(ii)]),
            t=total_time))

    print('\n-- Saving results... ', end='')

    # Protocol 2 is kept as in the stored-results format used so far.
    with open(metrics_paths['sdr'], 'wb') as f:
        pickle.dump(sdr, f, protocol=2)

    with open(metrics_paths['sir'], 'wb') as f:
        pickle.dump(sir, f, protocol=2)

    print('done!')
    print('-- That\'s all folks!')
Beispiel #4
0
def use_me_process(sources_list, output_file_names):
    """Separate the voice and the background music of the given files.

    Sets up the MaD system, loads its stored weights, runs every
    source file through it and hands the filtered voice estimate
    (``v_j_filt``), together with the matching output file names, to
    ``data_feeder.data_process_results_testing``.

    :param sources_list: The file names to be used.
    :type sources_list: list[pathlib.Path]
    :param output_file_names: The output file names to be used; entry\
                              ``i`` belongs to ``sources_list[i]``.
    :type output_file_names: list[list[str]]
    :raises SystemExit: If the ``debug`` setting is enabled.
    """
    printing.print_msg('Welcome to MaD TwinNet.', end='\n\n')
    if debug:
        printing.print_msg('Cannot proceed in debug mode. '
                           'Please set `debug=False` at the settings '
                           'file.')
        printing.print_msg('Exiting.')
        # Same effect as `exit(-1)` without relying on the `site` helper.
        raise SystemExit(-1)
    printing.print_msg('Now I will extract the voice and the '
                       'background music from the provided files')

    device = 'cuda' if not debug and torch.cuda.is_available() else 'cpu'

    # MaD setting up
    mad = MaD(rnn_enc_input_dim=hyper_parameters['reduced_dim'],
              rnn_dec_input_dim=hyper_parameters['rnn_enc_output_dim'],
              original_input_dim=hyper_parameters['original_input_dim'],
              context_length=hyper_parameters['context_length'])

    # Deserialize onto the CPU first so that weights stored from a GPU
    # run can still be loaded on a machine without CUDA.
    mad.load_state_dict(
        torch.load(output_states_path['mad'], map_location='cpu'))
    mad = mad.to(device).eval()

    testing_it = data_feeder.data_feeder_testing(
        window_size=hyper_parameters['window_size'],
        fft_size=hyper_parameters['fft_size'],
        hop_size=hyper_parameters['hop_size'],
        seq_length=hyper_parameters['seq_length'],
        context_length=hyper_parameters['context_length'],
        batch_size=1,
        debug=debug,
        sources_list=sources_list)

    printing.print_msg('Let\'s go!', end='\n\n')
    total_time = 0

    batch_size = training_constants['batch_size']

    for index, data in enumerate(testing_it()):

        s_time = time.time()

        mix, mix_magnitude, mix_phase, voice_true, bg_true = data

        voice_predicted = np.zeros(
            (mix_magnitude.shape[0], hyper_parameters['seq_length'] -
             hyper_parameters['context_length'] * 2,
             hyper_parameters['window_size']),
            dtype=np.float32)

        # Without `no_grad` the output is presumably attached to the
        # autograd graph, and `.numpy()` refuses tensors that require grad.
        with torch.no_grad():
            # NOTE(review): only full batches are processed; rows of a
            # trailing partial batch stay zero. Confirm the feeder yields
            # a multiple of the batch size.
            for batch in range(int(mix_magnitude.shape[0] / batch_size)):
                b_start = batch * batch_size
                b_end = (batch + 1) * batch_size

                v_in = torch.from_numpy(
                    mix_magnitude[b_start:b_end, :, :]).to(device)

                voice_predicted[b_start:b_end, :, :] = mad(
                    v_in).v_j_filt.cpu().numpy()

        data_feeder.data_process_results_testing(
            index=index,
            voice_true=voice_true,
            bg_true=bg_true,
            voice_predicted=voice_predicted,
            window_size=hyper_parameters['window_size'],
            mix=mix,
            mix_magnitude=mix_magnitude,
            mix_phase=mix_phase,
            hop=hyper_parameters['hop_size'],
            context_length=hyper_parameters['context_length'],
            output_file_name=output_file_names[index])

        e_time = time.time()

        printing.print_msg(
            usage_output_string_per_example.format(f=sources_list[index],
                                                   t=e_time - s_time))

        total_time += e_time - s_time

    printing.print_msg('MaDTwinNet finished')
    printing.print_msg(usage_output_string_total.format(t=total_time))
    printing.print_msg('That\'s all folks!')