Example #1
def save_filtered_result(self):
    # util.write_wav takes (data, filepath, sample_rate) in the examples
    # below; the output-path and sample-rate attributes here are assumed.
    util.write_wav(self.filtered_wav, self.output_filepath, self.sample_rate)
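
For context, a minimal sketch of what such a write_wav helper might look like, assuming float input and 16-bit PCM output via scipy (the real util module is project-local and not shown; names here are illustrative):

import numpy as np
from scipy.io import wavfile

def write_wav(data, filepath, sample_rate):
    # Clip to [-1, 1] and convert float samples to 16-bit PCM before writing.
    data = np.clip(np.asarray(data, dtype=np.float64), -1.0, 1.0)
    wavfile.write(filepath, sample_rate, (data * 32767).astype(np.int16))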
Example #2
# Shared imports assumed by the examples below; util (and the
# plot_layer_outputs helper used in this example) are project-local modules.
import os
import itertools

import numpy as np
import tqdm
import mir_eval

import util


def explain(model, input, output_filename_prefix, sample_rate, output_path):

    batch_size = 1
    if len(input['noisy']) < model.receptive_field_length:
        raise ValueError(
            'Input is not long enough to be used with this model.')

    # Each input window of input_length samples yields target_field_length
    # output samples, so only the input minus (receptive_field_length - 1)
    # samples of context can be reconstructed.
    num_output_samples = input['noisy'].shape[0] - (
        model.receptive_field_length - 1)
    num_fragments = int(np.ceil(num_output_samples /
                                model.target_field_length))
    num_batches = int(np.ceil(num_fragments / batch_size))

    denoised_output = []
    noise_output = []
    num_pad_values = 0
    fragment_i = 0
    for batch_i in tqdm.tqdm(range(0, num_batches)):

        if batch_i == num_batches - 1:  # If it's the last batch
            batch_size = num_fragments - batch_i * batch_size

        input_batch = np.zeros((batch_size, model.input_length))

        # Assemble a batch of overlapping input windows; each window advances
        # by target_field_length samples.
        for batch_fragment_i in range(0, batch_size):

            if fragment_i + model.target_field_length > num_output_samples:
                remainder = input['noisy'][fragment_i:]
                current_fragment = np.zeros((model.input_length, ))
                current_fragment[:remainder.shape[0]] = remainder
                num_pad_values = model.input_length - remainder.shape[0]
            else:
                current_fragment = input['noisy'][fragment_i:fragment_i +
                                                  model.input_length]

            input_batch[batch_fragment_i, :] = current_fragment
            fragment_i += model.target_field_length

        denoised_output_fragments = model.denoise_batch(
            {'data_input': input_batch})
        layer_outputs = model.get_layer_outputs(input_batch)
        plot_layer_outputs(layer_outputs, 2, output_path)
        if isinstance(denoised_output_fragments, list):
            noise_output_fragment = denoised_output_fragments[1]
            denoised_output_fragment = denoised_output_fragments[0]
        else:
            denoised_output_fragment = denoised_output_fragments
            noise_output_fragment = None

        denoised_output_fragment = denoised_output_fragment[
            :, model.target_padding:
            model.target_padding + model.target_field_length]
        denoised_output_fragment = denoised_output_fragment.flatten().tolist()

        if noise_output_fragment is not None:
            noise_output_fragment = noise_output_fragment[
                :, model.target_padding:
                model.target_padding + model.target_field_length]
            noise_output_fragment = noise_output_fragment.flatten().tolist()

        if isinstance(denoised_output_fragments, float):
            denoised_output_fragment = [denoised_output_fragment]
        if isinstance(noise_output_fragment, float):
            noise_output_fragment = [noise_output_fragment]

        denoised_output = denoised_output + denoised_output_fragment
        if noise_output_fragment is not None:
            noise_output = noise_output + noise_output_fragment

    denoised_output = np.array(denoised_output)
    noise_output = np.array(noise_output)

    if num_pad_values != 0:
        denoised_output = denoised_output[:-num_pad_values]
        noise_output = noise_output[:-num_pad_values]

    valid_noisy_signal = input['noisy'][
        model.half_receptive_field_length:
        model.half_receptive_field_length + len(denoised_output)]

    if input['clean'] is not None:
        input['noise'] = input['noisy'] - input['clean']

        valid_clean_signal = input['clean'][
            model.half_receptive_field_length:
            model.half_receptive_field_length + len(denoised_output)]

        noise_in_denoised_output = denoised_output - valid_clean_signal

        rms_clean = util.rms(valid_clean_signal)
        rms_noise_out = util.rms(noise_in_denoised_output)
        rms_noise_in = util.rms(input['noise'])

        new_snr_db = int(np.round(util.snr_db(rms_clean, rms_noise_out)))
        initial_snr_db = int(np.round(util.snr_db(rms_clean, rms_noise_in)))

        output_clean_filename = output_filename_prefix + 'clean.wav'
        output_clean_filepath = os.path.join(output_path,
                                             output_clean_filename)
        util.write_wav(valid_clean_signal, output_clean_filepath, sample_rate)

        output_denoised_filename = output_filename_prefix + 'denoised_%ddB.wav' % new_snr_db
        output_noisy_filename = output_filename_prefix + 'noisy_%ddB.wav' % initial_snr_db
    else:
        output_denoised_filename = output_filename_prefix + 'denoised.wav'
        output_noisy_filename = output_filename_prefix + 'noisy.wav'

    output_noise_filename = output_filename_prefix + 'noise.wav'

    output_denoised_filepath = os.path.join(output_path,
                                            output_denoised_filename)
    output_noisy_filepath = os.path.join(output_path, output_noisy_filename)
    output_noise_filepath = os.path.join(output_path, output_noise_filename)

    util.write_wav(denoised_output, output_denoised_filepath, sample_rate)
    util.write_wav(valid_noisy_signal, output_noisy_filepath, sample_rate)
    util.write_wav(noise_output, output_noise_filepath, sample_rate)
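
A hypothetical call, assuming a trained model object and an illustrative util.load_wav(filepath, sample_rate) helper that returns a float array (neither is defined in the code above):

noisy = util.load_wav('noisy_input.wav', 16000)      # hypothetical helper
clean = util.load_wav('clean_reference.wav', 16000)  # optional reference
explain(model, {'noisy': noisy, 'clean': clean},
        'sample_', 16000, 'output/')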
Example #3
def separate_sample(model, input, batch_size, output_filename_prefix, sample_rate, output_path, target):

    if target == 'singing-voice':

        if len(input['mixture']) < model.receptive_field_length:
            raise ValueError('Input is not long enough to be used with this model.')

        num_output_samples = input['mixture'].shape[0] - (model.receptive_field_length - 1)
        num_fragments = int(np.ceil(num_output_samples / model.target_field_length))
        num_batches = int(np.ceil(num_fragments / batch_size))

        vocals_output = []
        num_pad_values = 0
        fragment_i = 0
        for batch_i in tqdm.tqdm(range(0, num_batches)):

            if batch_i == num_batches - 1:  # If it's the last batch
                batch_size = num_fragments - batch_i * batch_size

            input_batch = np.zeros((batch_size, model.input_length))

            # Assemble batch
            for batch_fragment_i in range(0, batch_size):

                if fragment_i + model.target_field_length > num_output_samples:
                    remainder = input['mixture'][fragment_i:]
                    current_fragment = np.zeros((model.input_length,))
                    current_fragment[:remainder.shape[0]] = remainder
                    num_pad_values = model.input_length - remainder.shape[0]
                else:
                    current_fragment = input['mixture'][fragment_i:fragment_i + model.input_length]

                input_batch[batch_fragment_i, :] = current_fragment
                fragment_i += model.target_field_length

            separated_output_fragments = model.separate_batch({'data_input': input_batch})

            if isinstance(separated_output_fragments, list):
                vocals_output_fragment = separated_output_fragments[0]
            else:
                vocals_output_fragment = separated_output_fragments

            vocals_output_fragment = vocals_output_fragment[
                :, model.target_padding:
                model.target_padding + model.target_field_length]
            vocals_output_fragment = vocals_output_fragment.flatten().tolist()

            if isinstance(separated_output_fragments, float):
                vocals_output_fragment = [vocals_output_fragment]

            vocals_output = vocals_output + vocals_output_fragment

        vocals_output = np.array(vocals_output)

        if num_pad_values != 0:
            vocals_output = vocals_output[:-num_pad_values]

        mixture_valid_signal = input['mixture'][
            model.half_receptive_field_length:
            model.half_receptive_field_length + len(vocals_output)]

        accompaniment_output = mixture_valid_signal - vocals_output

        output_vocals_filename = output_filename_prefix + '_vocals.wav'
        output_accompaniment_filename = output_filename_prefix + '_accompaniment.wav'

        output_vocals_filepath = os.path.join(output_path, output_vocals_filename)
        output_accompaniment_filepath = os.path.join(output_path, output_accompaniment_filename)

        util.write_wav(vocals_output, output_vocals_filepath, sample_rate)
        util.write_wav(accompaniment_output, output_accompaniment_filepath, sample_rate)

    if target == 'multi-instrument':

        if len(input['mixture']) < model.receptive_field_length:
            raise ValueError('Input is not long enough to be used with this model.')

        num_output_samples = input['mixture'].shape[0] - (model.receptive_field_length - 1)
        num_fragments = int(np.ceil(num_output_samples / model.target_field_length))
        num_batches = int(np.ceil(num_fragments / batch_size))

        vocals_output = []
        drums_output = []
        bass_output = []

        num_pad_values = 0
        fragment_i = 0
        for batch_i in tqdm.tqdm(range(0, num_batches)):

            if batch_i == num_batches - 1:  # If it's the last batch
                batch_size = num_fragments - batch_i * batch_size

            input_batch = np.zeros((batch_size, model.input_length))

            # Assemble batch
            for batch_fragment_i in range(0, batch_size):

                if fragment_i + model.target_field_length > num_output_samples:
                    remainder = input['mixture'][fragment_i:]
                    current_fragment = np.zeros((model.input_length,))
                    current_fragment[:remainder.shape[0]] = remainder
                    num_pad_values = model.input_length - remainder.shape[0]
                else:
                    current_fragment = input['mixture'][fragment_i:fragment_i + model.input_length]

                input_batch[batch_fragment_i, :] = current_fragment
                fragment_i += model.target_field_length

            separated_output_fragments = model.separate_batch({'data_input': input_batch})

            if isinstance(separated_output_fragments, list):
                vocals_output_fragment = separated_output_fragments[0]
                drums_output_fragment = separated_output_fragments[1]
                bass_output_fragment = separated_output_fragments[2]
            else:
                raise ValueError(
                    'Expected three source channels from separate_batch.')

            vocals_output_fragment = vocals_output_fragment[
                :, model.target_padding:
                model.target_padding + model.target_field_length]
            vocals_output_fragment = vocals_output_fragment.flatten().tolist()

            drums_output_fragment = drums_output_fragment[
                :, model.target_padding:
                model.target_padding + model.target_field_length]
            drums_output_fragment = drums_output_fragment.flatten().tolist()

            bass_output_fragment = bass_output_fragment[
                :, model.target_padding:
                model.target_padding + model.target_field_length]
            bass_output_fragment = bass_output_fragment.flatten().tolist()

            if isinstance(separated_output_fragments, float):
                vocals_output_fragment = [vocals_output_fragment]
            if isinstance(drums_output_fragment, float):
                drums_output_fragment = [drums_output_fragment]
            if isinstance(bass_output_fragment, float):
                bass_output_fragment = [bass_output_fragment]

            vocals_output = vocals_output + vocals_output_fragment
            drums_output = drums_output + drums_output_fragment
            bass_output = bass_output + bass_output_fragment

        vocals_output = np.array(vocals_output)
        drums_output = np.array(drums_output)
        bass_output = np.array(bass_output)

        if num_pad_values != 0:
            vocals_output = vocals_output[:-num_pad_values]
            drums_output = drums_output[:-num_pad_values]
            bass_output = bass_output[:-num_pad_values]

        mixture_valid_signal = input['mixture'][
            model.half_receptive_field_length:
            model.half_receptive_field_length + len(vocals_output)]

        other_output = mixture_valid_signal - vocals_output - drums_output - bass_output

        output_vocals_filename = output_filename_prefix + '_vocals.wav'
        output_drums_filename = output_filename_prefix + '_drums.wav'
        output_bass_filename = output_filename_prefix + '_bass.wav'
        output_other_filename = output_filename_prefix + '_other.wav'

        output_vocals_filepath = os.path.join(output_path, output_vocals_filename)
        output_drums_filepath = os.path.join(output_path, output_drums_filename)
        output_bass_filepath = os.path.join(output_path, output_bass_filename)
        output_other_filepath = os.path.join(output_path, output_other_filename)

        util.write_wav(vocals_output, output_vocals_filepath, sample_rate)
        util.write_wav(drums_output, output_drums_filepath, sample_rate)
        util.write_wav(bass_output, output_bass_filepath, sample_rate)
        util.write_wav(other_output, output_other_filepath, sample_rate)
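
A hypothetical call for the singing-voice case, again using the illustrative load_wav helper:

mixture = util.load_wav('song_mix.wav', 22050)  # hypothetical helper
separate_sample(model, {'mixture': mixture}, batch_size=16,
                output_filename_prefix='song', sample_rate=22050,
                output_path='output/', target='singing-voice')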
Example #4
def denoise_sample(model,
                   input,
                   condition_input,
                   batch_size,
                   output_filename_prefix,
                   sample_rate,
                   n_spk,
                   n_channel,
                   output_path,
                   save_wav=False,
                   spk_gender=None,
                   use_pit=False,
                   pad=False):
    if pad:
        # Zero-pad half the receptive field on both ends so the full input
        # can be reconstructed.
        noisy_pad = np.zeros(model.half_receptive_field_length * 2 +
                             len(input['noisy']))
        noisy_pad[model.half_receptive_field_length:
                  model.half_receptive_field_length +
                  len(input['noisy'])] = input['noisy']
        input['noisy'] = noisy_pad

    if len(input['noisy']) < model.receptive_field_length:
        raise ValueError(
            'Input is not long enough to be used with this model.')

    num_output_samples = input['noisy'].shape[0] - (
        model.receptive_field_length - 1)
    num_fragments = int(np.ceil(num_output_samples /
                                model.target_field_length))
    num_batches = int(np.ceil(num_fragments / batch_size))

    # Per-channel speaker-gender tally (kept for the return value; not
    # updated in the active code path).
    ch_gender = {'ch1': {'M': 0, 'F': 0}, 'ch2': {'M': 0, 'F': 0}}

    # One output buffer per channel.
    output = [[] for _ in range(n_channel)]
    num_pad_values = 0
    fragment_i = 0

    for batch_i in tqdm.tqdm(range(0, num_batches)):

        if batch_i == num_batches - 1:  # If it's the last batch
            batch_size = num_fragments - batch_i * batch_size

        input_batch = np.zeros((batch_size, model.input_length))

        # Assemble a batch of overlapping input windows.
        for batch_fragment_i in range(0, batch_size):

            if fragment_i + model.target_field_length > num_output_samples:
                remainder = input['noisy'][fragment_i:]
                current_fragment = np.zeros((model.input_length, ))
                current_fragment[:remainder.shape[0]] = remainder
                num_pad_values = model.input_length - remainder.shape[0]
            else:
                current_fragment = input['noisy'][fragment_i:fragment_i +
                                                  model.input_length]

            input_batch[batch_fragment_i, :] = current_fragment
            fragment_i += model.target_field_length

        # Stack the input with an all-zero second channel, giving the shape
        # (batch_size, 2, input_length) that the model receives.
        input_batch = np.concatenate([
            np.expand_dims(input_batch, 0),
            np.zeros_like(np.expand_dims(input_batch, 0))
        ])
        input_batch = np.transpose(input_batch, (1, 0, 2))

        output_fragments = model.denoise_batch({'data_input': input_batch})
        output_fragments = output_fragments[
            :, :, model.target_padding:
            model.target_padding + model.target_field_length]

        for i in range(n_channel):
            output[i] += output_fragments[:, i].flatten().tolist()
    output = np.array(output)

    if num_pad_values != 0:
        output = output[:, :-num_pad_values]

    voice_len = len(output[0])
    valid_noisy_signal = input['noisy'][
        model.half_receptive_field_length:
        model.half_receptive_field_length + voice_len]
    valid_clean_signal_1 = (input['clean_1'][
        model.half_receptive_field_length:
        model.half_receptive_field_length + voice_len]
        if not pad else input['clean_1'])
    valid_clean_signal_2 = (input['clean_2'][
        model.half_receptive_field_length:
        model.half_receptive_field_length + voice_len]
        if not pad else input['clean_2'])

    if use_pit:
        pit_output_1 = []
        pit_output_2 = []
        pit_idx1 = []
        pit_idx2 = []
        for f in range(num_fragments):
            c1 = valid_clean_signal_1[f * model.target_field_length:
                                      (f + 1) * model.target_field_length]
            c2 = valid_clean_signal_2[f * model.target_field_length:
                                      (f + 1) * model.target_field_length]

            o = output[:, f * model.target_field_length:
                       (f + 1) * model.target_field_length]
            # Enumerate every assignment of output channels to speakers and
            # pick the one with the smallest total L1 error against the
            # clean reference fragments.
            perms = np.array(
                list(itertools.permutations(range(n_channel), n_spk)))
            perms_onehot = (np.arange(perms.max() + 1) ==
                            perms[..., None]).astype(int)

            cross_loss = (np.expand_dims(np.array([c1, c2]), 1) -
                          np.expand_dims(o, 0))
            cross_loss_abs = np.sum(np.abs(cross_loss), 2)
            loss_sets = np.einsum('ij,pij->p', cross_loss_abs, perms_onehot)
            best_perm = perms[np.argmin(loss_sets)]

            pit_output_1 += o[best_perm[0]].tolist()
            pit_output_2 += o[best_perm[1]].tolist()
            pit_idx1.append(best_perm[0])
            pit_idx2.append(best_perm[1])

        clean_wav = np.array([valid_clean_signal_1, valid_clean_signal_2])
        est_wav = np.array([pit_output_1, pit_output_2])

        _sdr1, _sir, _sar, _popt = mir_eval.separation.bss_eval_sources(
            clean_wav[0], est_wav[0])
        _sdr2, _sir, _sar, _popt = mir_eval.separation.bss_eval_sources(
            clean_wav[1], est_wav[1])

        return np.array([_sdr1, _sdr2]), ch_gender, [pit_idx1, pit_idx2]

    else:
        clean_wav = np.array([valid_clean_signal_1, valid_clean_signal_2])

        perms = np.array(list(itertools.permutations(range(n_channel), n_spk)))
        perms_onehot = (np.arange(perms.max() + 1) ==
                        perms[..., None]).astype(int)

        cross_loss = np.expand_dims(clean_wav, 1) - np.expand_dims(output, 0)
        cross_loss_abs = np.sum(np.abs(cross_loss), 2)
        loss_sets = np.einsum('ij,pij->p', cross_loss_abs, perms_onehot)
        best_perm = perms[np.argmin(loss_sets)]

        _sdr1, _sir, _sar, _popt = mir_eval.separation.bss_eval_sources(
            clean_wav[0], output[best_perm[0]])
        _sdr2, _sir, _sar, _popt = mir_eval.separation.bss_eval_sources(
            clean_wav[1], output[best_perm[1]])

        return np.array([_sdr1, _sdr2]), ch_gender, best_perm

    # Note: unreachable as written; both branches above return first.
    if save_wav:
        output_original_filename = output_filename_prefix + 'orig.wav'
        output_s1_filename = output_filename_prefix + 's1.wav'
        output_s2_filename = output_filename_prefix + 's2.wav'

        output_original_filepath = os.path.join(output_path,
                                                output_original_filename)
        output_s1_filepath = os.path.join(output_path, output_s1_filename)
        output_s2_filepath = os.path.join(output_path, output_s2_filename)

        util.write_wav(valid_noisy_signal, output_original_filepath,
                       sample_rate)
        util.write_wav(output[0], output_s1_filepath, sample_rate)
        util.write_wav(output[1], output_s2_filepath, sample_rate)
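
The permutation search above, condensed: a minimal, self-contained sketch of permutation-invariant matching over full permutations (function and argument names are illustrative, not from the original code):

import itertools
import numpy as np

def best_permutation(refs, ests):
    # refs, ests: arrays of shape (n_sources, n_samples).
    # Returns the ordering of ests that minimizes total L1 error vs. refs.
    best_perm, best_loss = None, np.inf
    for perm in itertools.permutations(range(len(ests))):
        loss = sum(np.abs(refs[i] - ests[p]).sum() for i, p in enumerate(perm))
        if loss < best_loss:
            best_perm, best_loss = perm, loss
    return best_perm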