def main():
    """Write sample wav files for one mix separated with a saved L41Model.

    Restores the model from ``model_save_base``, advances the mix iterator to
    mix number ``mix_number`` (1-indexed) and writes the mixture, every
    original source and every separated source as wav files in
    ``output_path``.

    Raises:
        ValueError: if ``mix_number`` is not within ``1..mixer.epoch_size()``.
    """

    def _load_json(path):
        # Close each settings file as soon as it has been parsed.
        with open(path) as json_file:
            return json.load(json_file)

    # from model settings
    model_params = {
        'nonlinearity': 'tanh',
        'layer_size': 600,
        'embedding_size': 40,
        # NOTE(review): this is the *string* 'False', which is truthy if the
        # model treats it as a boolean -- confirm how L41Model reads it.
        'normalize': 'False'
    }
    uid_settings = '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/assign_uids_LibriSpeech_UrbanSound8K.json'
    model_save_base = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/large_l41'
    model_location = '/cpu:0'
    mixes = [
        '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_in_sample.json'
    ]
    from_disk = True
    mix_number = 1
    output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/sample_wav_files/large_lab41'

    os.makedirs(output_path, exist_ok=True)

    mixer = MixIterator(mixes_settings_filenames=mixes,
                        batch_size=1,
                        from_disk=from_disk)

    # get frequency dimension
    frequency_dim = mixer.sample_dimensions()[0]

    # get number of sources
    settings = _load_json(uid_settings)
    uid_csv = pd.read_csv(settings['output_file'])
    number_of_sources = uid_csv['uid'].max() + 1

    model = L41Model(**model_params,
                     num_speakers=number_of_sources,
                     F=frequency_dim,
                     device=model_location)
    model.load(model_save_base)

    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if not 1 <= mix_number <= mixer.epoch_size():
        raise ValueError(
            'mix_number must be between 1 and {} (got {})'.format(
                mixer.epoch_size(), mix_number))

    mix_settings = _load_json(mixes[0])
    signal = mix_settings['signals'][0]
    preprocessing_settings = _load_json(signal['preprocessing_settings'])
    processing_parameters = preprocessing_settings['processing_parameters']
    stft_args = processing_parameters['stft_args']
    istft_args = convert_preprocessing_parameters(stft_args)
    preemphasis_coeff = processing_parameters['preemphasis_coeff']
    n_fft = stft_args.get('n_fft', 2048)

    def _to_waveform(spectrogram):
        # Invert the preprocessing, silence the last n_fft samples and
        # standardize the result.
        waveform = undo_preprocessing(spectrogram,
                                      mixer.sample_length_in_bits(),
                                      preemphasis_coeff=preemphasis_coeff,
                                      istft_args=istft_args)
        # NOTE: the masking creates a chirp in the last fft frame (likely due
        # to the mask); the tail is zeroed on the mix and the originals too so
        # that they stay comparable to the reconstructed waveforms.
        waveform[-n_fft:] = 0.0
        return standardize_waveform(waveform)

    def _write_wav(file_name, waveform):
        lr.output.write_wav(os.path.join(output_path, file_name),
                            waveform,
                            mixer.sample_rate(),
                            norm=True)

    # advance the iterator to the requested mix
    for _ in range(mix_number):
        spec, _bin_masks, source_specs, _uids, snrs = next(mixer)

    # keep the un-indexed batch: that is what the separation routine is given
    model_spec = spec
    spec = spec[0]
    source_specs = source_specs[0]
    snrs = snrs[0]
    print('SNR of this mix: {}'.format(snrs))

    _write_wav('mix_{}.wav'.format(mix_number), _to_waveform(spec))

    for source_index, source_spec in enumerate(source_specs):
        _write_wav(
            'mix_{}_original_source_{}.wav'.format(mix_number,
                                                   source_index + 1),
            _to_waveform(source_spec))

    separated_specs = l41_clustering_separate(
        model_spec, model, mixer.number_of_samples_in_mixes())

    for source_index, separated_spec in enumerate(separated_specs):
        _write_wav(
            'mix_{}_separated_{}.wav'.format(mix_number, source_index + 1),
            _to_waveform(separated_spec))
def main():
    """Separate every mix of one epoch with a saved SNMF model.

    For each mix the mixture is written as a wav file resampled to
    ``eval_sr``. Each separated waveform is then paired with the remaining
    original source waveform that has the smallest mean squared error, and
    both are written to ``output_path`` under that source's number.
    """

    def _load_json(path):
        # Close each settings file as soon as it has been parsed.
        with open(path) as json_file:
            return json.load(json_file)

    # from model settings
    random_seed = 1234567890
    params = {
        'cf': 'kl',
        'sparsity': 5,
        'R': 1000,
        'conv_eps': 1e-3,
        'verbose': False,
        'max_iter': 25,
        'rng': np.random.RandomState(random_seed),
    }
    T_L = 8
    T_R = 0
    library_output_file = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/snmf/library_weights.hdf5'
    # swap in the commented-out values below for the in-sample test set
    mixes = [
        '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_out_of_sample.json'
    ]
    # mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_in_sample.json']
    from_disk = True
    output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/snmf/out_of_sample_test'
    # output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/snmf/in_sample_test'
    eval_sr = 8000

    mixer = MixIterator(mixes_settings_filenames=mixes,
                        batch_size=1,
                        from_disk=from_disk)

    model = SNMF(T_L, T_R, params['R'], params['sparsity'], params['cf'])
    model.load(library_output_file)

    mix_settings = _load_json(mixes[0])
    signal = mix_settings['signals'][0]
    preprocessing_settings = _load_json(signal['preprocessing_settings'])
    processing_parameters = preprocessing_settings['processing_parameters']
    stft_args = processing_parameters['stft_args']
    istft_args = convert_preprocessing_parameters(stft_args)
    preemphasis_coeff = processing_parameters['preemphasis_coeff']
    n_fft = stft_args.get('n_fft', 2048)

    def _to_waveform(spectrogram):
        # Invert the preprocessing, silence the last n_fft samples, resample
        # to the evaluation rate and standardize the result.
        waveform = undo_preprocessing(spectrogram,
                                      mixer.sample_length_in_bits(),
                                      preemphasis_coeff=preemphasis_coeff,
                                      istft_args=istft_args)
        # NOTE: the masking creates a chirp in the last fft frame (likely due
        # to the binary mask); the tail is zeroed on the mix and the originals
        # too so that they stay comparable to the reconstructed waveforms.
        waveform[-n_fft:] = 0.0
        waveform = lr.core.resample(waveform,
                                    mixer.sample_rate(),
                                    eval_sr,
                                    scale=True)
        return standardize_waveform(waveform)

    def _write_wav(file_name, waveform):
        lr.output.write_wav(os.path.join(output_path, file_name),
                            waveform,
                            eval_sr,
                            norm=True)

    os.makedirs(output_path, exist_ok=True)

    for mix_index in tqdm.trange(mixer.epoch_size()):
        mix_label = mix_index + 1  # file names are 1-indexed
        spec, _bin_masks, source_specs, _uids, snrs = next(mixer)
        spec = spec[0]
        source_specs = source_specs[0]
        snrs = snrs[0]

        _write_wav('mix_{}_snr_{:.2f}.wav'.format(mix_label, snrs),
                   _to_waveform(spec))

        # original waveforms that have not been matched yet, by source index
        originals = {
            source_index: _to_waveform(source_spec)
            for source_index, source_spec in enumerate(source_specs)
        }

        # use model to source-separate the spectrogram
        separated_specs = model.source_separate(spec,
                                                max_iter=params['max_iter'],
                                                conv_eps=params['conv_eps'],
                                                rng=params['rng'],
                                                verbose=params['verbose'])

        for source_key in separated_specs:
            y = _to_waveform(separated_specs[source_key])

            # match this waveform with an original source waveform
            min_key = 0
            min_mse = np.inf
            for key in originals:
                mse = np.mean((y - originals[key])**2)
                if mse < min_mse:
                    min_key = key
                    min_mse = mse

            # Claim the matched original *before* using it, so a failed match
            # is reported here instead of raising KeyError on the lookup.
            y_original = originals.pop(min_key, None)
            if y_original is None:
                print("something went horribly wrong")
                continue

            _write_wav(
                'mix_{}_original_source_{}.wav'.format(mix_label,
                                                       min_key + 1),
                y_original)
            _write_wav(
                'mix_{}_separated_source_{}.wav'.format(mix_label,
                                                        min_key + 1), y)
def main():
    """Write wav files for one mix separated with a saved L41Model.

    Restores the model from ``model_save_base``, advances the mix iterator to
    mix number ``mix_number`` (1-indexed) and writes the mixture, every
    original source and every separated source as wav files whose names start
    with ``output_path``.

    Raises:
        ValueError: if ``mix_number`` is not within ``1..mixer.epoch_size()``.
    """

    def _load_json(path):
        # Close each settings file as soon as it has been parsed.
        with open(path) as json_file:
            return json.load(json_file)

    # from model settings
    model_params = {
        'nonlinearity': 'tanh',
        'layer_size': 600,
        'embedding_size': 40,
        # NOTE(review): this is the *string* 'False', which is truthy if the
        # model treats it as a boolean -- confirm how L41Model reads it.
        'normalize': 'False'
    }
    uid_settings = '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/assign_uids_LibriSpeech_UrbanSound8K.json'
    model_save_base = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/l41'
    model_location = '/cpu:0'
    mixes = [
        '/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_in_sample.json'
    ]
    from_disk = True
    mix_number = 1
    # NOTE(review): used below as a file name *prefix* ('..._mix.wav'), not
    # as a directory -- confirm that is intended.
    output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux'

    mixer = MixIterator(mixes_settings_filenames=mixes,
                        batch_size=1,
                        from_disk=from_disk)

    # get frequency dimension
    frequency_dim = mixer.sample_dimensions()[0]

    # get number of sources
    settings = _load_json(uid_settings)
    uid_csv = pd.read_csv(settings['output_file'])
    number_of_sources = uid_csv['uid'].max() + 1

    model = L41Model(**model_params,
                     num_speakers=number_of_sources,
                     F=frequency_dim,
                     device=model_location)
    model.load(model_save_base)

    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if not 1 <= mix_number <= mixer.epoch_size():
        raise ValueError(
            'mix_number must be between 1 and {} (got {})'.format(
                mixer.epoch_size(), mix_number))

    mix_settings = _load_json(mixes[0])
    signal = mix_settings['signals'][0]
    preprocessing_settings = _load_json(signal['preprocessing_settings'])
    processing_parameters = preprocessing_settings['processing_parameters']
    istft_args = convert_preprocessing_parameters(
        processing_parameters['stft_args'])
    preemphasis_coeff = processing_parameters['preemphasis_coeff']

    def _to_waveform(spectrogram):
        # Invert the preprocessing applied to a single spectrogram.
        return undo_preprocessing(spectrogram,
                                  mixer.sample_length_in_bits(),
                                  preemphasis_coeff=preemphasis_coeff,
                                  istft_args=istft_args)

    def _write_wav(file_name, waveform):
        lr.output.write_wav(file_name,
                            waveform,
                            mixer.sample_rate(),
                            norm=True)

    # advance the iterator to the requested mix
    for _ in range(mix_number):
        spec, _bin_masks, source_specs, _uids, snrs = next(mixer)

    # keep the un-indexed batch: that is what the separation routine is given
    model_spec = spec
    spec = spec[0]
    source_specs = source_specs[0]
    snrs = snrs[0]
    print('SNR of this mix: {}'.format(snrs))

    _write_wav('{}_mix.wav'.format(output_path), _to_waveform(spec))

    for source_index, source_spec in enumerate(source_specs):
        _write_wav(
            '{}_original_source_{}.wav'.format(output_path, source_index),
            _to_waveform(source_spec))

    separated_specs = l41_clustering_separate(
        model_spec, model, mixer.number_of_samples_in_mixes())

    for source_index, separated_spec in enumerate(separated_specs):
        _write_wav(
            '{}_separated_source_{}.wav'.format(output_path, source_index),
            _to_waveform(separated_spec))
def main():
    """Write sample wav files for one mix separated with a saved SNMF model.

    Loads the model from ``library_output_file``, advances the mix iterator
    to mix number ``mix_number`` (1-indexed) and writes the mixture, every
    original source and every separated source as wav files in
    ``output_path``.

    Raises:
        ValueError: if ``mix_number`` is not within ``1..mixer.epoch_size()``.
    """

    def _load_json(path):
        # Close each settings file as soon as it has been parsed.
        with open(path) as json_file:
            return json.load(json_file)

    # from model settings
    random_seed = 1234567890
    params = {
        'cf': 'kl',
        'sparsity': 5,
        'R': 1000,
        'conv_eps': 1e-3,
        'verbose': False,
        'max_iter': 25,
        'rng': np.random.RandomState(random_seed),
    }
    T_L = 8
    T_R = 0
    library_output_file = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/snmf/library_weights.hdf5'
    mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_in_sample.json']
    from_disk = True
    mix_number = 1
    output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/sample_wav_files/snmf'

    os.makedirs(output_path, exist_ok=True)

    mixer = MixIterator(mixes_settings_filenames=mixes,
                        batch_size=1,
                        from_disk=from_disk)

    model = SNMF(T_L, T_R, params['R'], params['sparsity'], params['cf'])
    model.load(library_output_file)

    # Explicit check instead of ``assert`` so it survives ``python -O``.
    if not 1 <= mix_number <= mixer.epoch_size():
        raise ValueError(
            'mix_number must be between 1 and {} (got {})'.format(
                mixer.epoch_size(), mix_number))

    mix_settings = _load_json(mixes[0])
    signal = mix_settings['signals'][0]
    preprocessing_settings = _load_json(signal['preprocessing_settings'])
    processing_parameters = preprocessing_settings['processing_parameters']
    stft_args = processing_parameters['stft_args']
    istft_args = convert_preprocessing_parameters(stft_args)
    preemphasis_coeff = processing_parameters['preemphasis_coeff']
    n_fft = stft_args.get('n_fft', 2048)

    def _to_waveform(spectrogram):
        # Invert the preprocessing, silence the last n_fft samples and
        # standardize the result.
        waveform = undo_preprocessing(spectrogram,
                                      mixer.sample_length_in_bits(),
                                      preemphasis_coeff=preemphasis_coeff,
                                      istft_args=istft_args)
        # NOTE: the masking creates a chirp in the last fft frame (likely due
        # to the mask); the tail is zeroed on the mix and the originals too so
        # that they stay comparable to the reconstructed waveforms.
        waveform[-n_fft:] = 0.0
        return standardize_waveform(waveform)

    def _write_wav(file_name, waveform):
        lr.output.write_wav(os.path.join(output_path, file_name),
                            waveform,
                            mixer.sample_rate(),
                            norm=True)

    # advance the iterator to the requested mix
    for _ in range(mix_number):
        spec, _bin_masks, source_specs, _uids, snrs = next(mixer)

    spec = spec[0]
    source_specs = source_specs[0]
    snrs = snrs[0]
    print('SNR of this mix: {}'.format(snrs))

    _write_wav('mix_{}.wav'.format(mix_number), _to_waveform(spec))

    for source_index, source_spec in enumerate(source_specs):
        _write_wav(
            'mix_{}_original_source_{}.wav'.format(mix_number,
                                                   source_index + 1),
            _to_waveform(source_spec))

    separated_specs = model.source_separate(spec,
                                            max_iter=params['max_iter'],
                                            conv_eps=params['conv_eps'],
                                            rng=params['rng'],
                                            verbose=params['verbose'])

    # the separated spectrograms are looked up by key; the key itself is
    # used in the output file name
    for source_key in separated_specs:
        _write_wav('mix_{}_separated_{}.wav'.format(mix_number, source_key),
                   _to_waveform(separated_specs[source_key]))
def main():
    """Separate every mix of one epoch with both heads of a saved Chimera model.

    For each mix the mixture is written as a wav file resampled to
    ``eval_sr``. The dc-head output (via clustering) is paired with the
    remaining original source waveform that has the smallest mean squared
    error, and both are written under that source's number. The mi-head
    output is written in the order the model returns it.
    """

    def _load_json(path):
        # Close each settings file as soon as it has been parsed.
        with open(path) as json_file:
            return json.load(json_file)

    # from model settings
    model_params = {}
    model_save_base = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/model_saves/chimera'
    model_location = '/cpu:0'
    # swap in the commented-out values below for the in-sample test set
    # mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_in_sample.json']
    mixes = ['/local_data/magnolia/pipeline_data/date_2017_09_27_time_13_25/settings/mixing_LibriSpeech_UrbanSound8K_test_out_of_sample.json']
    from_disk = True
    # output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/chimera/in_sample_test'
    output_path = '/local_data/magnolia/experiment_data/date_2017_09_28_time_13_14/aux/evaluations/chimera/out_of_sample_test'
    eval_sr = 8000

    mixer = MixIterator(mixes_settings_filenames=mixes,
                        batch_size=1,
                        from_disk=from_disk)

    # get frequency dimension
    frequency_dim = mixer.sample_dimensions()[0]

    model = Chimera(**model_params,
                    F=frequency_dim,
                    device=model_location)
    model.load(model_save_base)

    mix_settings = _load_json(mixes[0])
    signal = mix_settings['signals'][0]
    preprocessing_settings = _load_json(signal['preprocessing_settings'])
    processing_parameters = preprocessing_settings['processing_parameters']
    stft_args = processing_parameters['stft_args']
    istft_args = convert_preprocessing_parameters(stft_args)
    preemphasis_coeff = processing_parameters['preemphasis_coeff']
    n_fft = stft_args.get('n_fft', 2048)

    def _to_waveform(spectrogram):
        # Invert the preprocessing, silence the last n_fft samples, resample
        # to the evaluation rate and standardize the result.
        waveform = undo_preprocessing(spectrogram,
                                      mixer.sample_length_in_bits(),
                                      preemphasis_coeff=preemphasis_coeff,
                                      istft_args=istft_args)
        # NOTE: the masking creates a chirp in the last fft frame (likely due
        # to the binary mask); the tail is zeroed on the mix and the originals
        # too so that they stay comparable to the reconstructed waveforms.
        waveform[-n_fft:] = 0.0
        waveform = lr.core.resample(waveform,
                                    mixer.sample_rate(),
                                    eval_sr,
                                    scale=True)
        return standardize_waveform(waveform)

    def _write_wav(file_name, waveform):
        lr.output.write_wav(os.path.join(output_path, file_name),
                            waveform,
                            eval_sr,
                            norm=True)

    os.makedirs(output_path, exist_ok=True)

    for mix_index in tqdm.trange(mixer.epoch_size()):
        mix_label = mix_index + 1  # file names are 1-indexed
        spec, _bin_masks, source_specs, _uids, snrs = next(mixer)
        # keep the un-indexed batch: that is what the model routines are given
        model_spec = spec
        spec = spec[0]
        source_specs = source_specs[0]
        snrs = snrs[0]

        _write_wav('mix_{}_snr_{:.2f}.wav'.format(mix_label, snrs),
                   _to_waveform(spec))

        # original waveforms that have not been matched yet, by source index
        originals = {
            source_index: _to_waveform(source_spec)
            for source_index, source_spec in enumerate(source_specs)
        }

        # use dc-head of model + clustering to source-separate the spectrogram
        dc_specs = chimera_clustering_separate(
            model_spec, model, mixer.number_of_samples_in_mixes())

        for dc_spec in dc_specs:
            y = _to_waveform(dc_spec)

            # match this waveform with an original source waveform
            min_key = 0
            min_mse = np.inf
            for key in originals:
                mse = np.mean((y - originals[key])**2)
                if mse < min_mse:
                    min_key = key
                    min_mse = mse

            # Claim the matched original *before* using it, so a failed match
            # is reported here instead of raising KeyError on the lookup.
            y_original = originals.pop(min_key, None)
            if y_original is None:
                print("something went horribly wrong")
                continue

            _write_wav(
                'mix_{}_original_source_{}.wav'.format(mix_label,
                                                       min_key + 1),
                y_original)
            _write_wav(
                'mix_{}_dc_separated_source_{}.wav'.format(mix_label,
                                                           min_key + 1), y)

        # use mi-head of model to source-separate the spectrogram
        mi_specs = chimera_mask(model_spec, model)[0]
        # sources are indexed along the third axis of the mi-head output
        for source_index in range(mi_specs.shape[2]):
            _write_wav(
                'mix_{}_mi_separated_source_{}.wav'.format(mix_label,
                                                           source_index + 1),
                _to_waveform(mi_specs[:, :, source_index]))
def main():
    """Reconstruct the waveforms of one mixed sample from the command line.

    Advances a ``MixIterator`` built from the ``--settings`` file to sample
    number ``--sample`` (1-indexed), then writes the mixture to
    ``<output base>_mix.wav`` and one binary-masked source per mask to
    ``<output base>_<uid>.wav``, where the base is ``--output_file`` without
    its extension.

    Raises:
        ValueError: if ``--sample`` is not within
            ``1..number_of_mixed_samples`` from the settings file.
    """
    # parse command line arguments
    parser = argparse.ArgumentParser(
        description='Reconstruct waveforms from mixes.')
    parser.add_argument('--sample', '-n',
                        default=1,
                        type=int,
                        help='sample number to write to file (1-indexed)')
    parser.add_argument('--output_file', '-o',
                        default='mix.wav',
                        help='output file name (wav format)')
    parser.add_argument('--settings', '-s',
                        default='../../settings/mixing_template.json',
                        help='sample mixing settings JSON file')
    parser.add_argument('--logger_settings', '-l',
                        default='../../settings/logging.conf',
                        help='logging configuration file')
    args = parser.parse_args()

    # Load logging configuration
    logging.config.fileConfig(args.logger_settings)

    mixer = MixIterator([args.settings], batch_size=1)
    mixer_iter = iter(mixer)

    with open(args.settings) as settings_file:
        settings = json.load(settings_file)

    total_number_of_mixed_samples = settings['number_of_mixed_samples']
    # Explicit check instead of ``assert``: this validates user input and must
    # not disappear under ``python -O``.
    if not 0 < args.sample <= total_number_of_mixed_samples:
        raise ValueError(
            'sample must be between 1 and {} (got {})'.format(
                total_number_of_mixed_samples, args.sample))

    signal = settings['signals'][0]
    with open(signal['preprocessing_settings']) as preprocessing_file:
        preprocessing_settings = json.load(preprocessing_file)
    processing_parameters = preprocessing_settings['processing_parameters']
    istft_args = convert_preprocessing_parameters(
        processing_parameters['stft_args'])
    preemphasis_coeff = processing_parameters['preemphasis_coeff']
    sample_rate = processing_parameters['target_sample_rate']
    sample_length = settings['target_sample_length']
    total_length = int(sample_length * sample_rate)

    # advance the iterator to the requested sample
    for _ in range(args.sample):
        spec, bin_masks, _source_specs, uids, snrs = next(mixer_iter)

    spec = spec[0]
    bin_masks = bin_masks[0]
    uids = uids[0]
    snrs = snrs[0]
    print('SNR of this mix: {}'.format(snrs))

    # every output file shares the requested name minus its extension
    output_base = os.path.splitext(args.output_file)[0]

    mix_file_name = '{}_mix.wav'.format(output_base)
    y = undo_preprocessing(spec,
                           total_length,
                           preemphasis_coeff=preemphasis_coeff,
                           istft_args=istft_args)
    lr.output.write_wav(mix_file_name, y, sample_rate, norm=True)

    # one reconstructed source per binary mask, named after its uid
    for source_index in range(bin_masks.shape[0]):
        source_file_name = '{}_{}.wav'.format(output_base,
                                              uids[source_index])
        source_spec = apply_binary_mask(bin_masks[source_index], spec)
        source_y = undo_preprocessing(source_spec,
                                      total_length,
                                      preemphasis_coeff=preemphasis_coeff,
                                      istft_args=istft_args)
        lr.output.write_wav(source_file_name,
                            source_y,
                            sample_rate,
                            norm=True)