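"""DDSP timbre transfer: feature extraction, conditioning adjustment, resynthesis."""

# NOTE: imports reconstructed so the module is self-contained. The import
# paths for the notebook helpers (detect_notes, fit_quantile_transform,
# get_tuning_factor, auto_tune, DEFAULT_SAMPLE_RATE) are an assumption; in
# recent DDSP releases some of them live in ddsp.training.postprocessing
# rather than ddsp.colab.colab_utils.
import os
import pickle
import sys
import time
import traceback

import gin
import librosa
import numpy as np
import tensorflow as tf

import ddsp
import ddsp.training
from ddsp.colab import colab_utils
from ddsp.colab.colab_utils import (DEFAULT_SAMPLE_RATE, auto_tune,
                                    detect_notes, get_tuning_factor)
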
def timbre_transfer(ckpt_dir, audio, in_sample_rate, out_sample_rate,
                    f0_octave_shift, f0_confidence_threshold,
                    loudness_db_shift, adjust, quiet, autotune, log=print):
  log("converting audio...")
  start_time = time.time()
  audio = librosa.to_mono(audio)
  # Keyword arguments keep this compatible with librosa >= 0.10, where
  # positional sample rates are no longer accepted.
  audio = librosa.resample(audio, orig_sr=in_sample_rate,
                           target_sr=out_sample_rate)
  audio = audio[np.newaxis, :]
  duration = time.time() - start_time
  log("done - {:.1f} s".format(duration))

  # Set up the session: reset CREPE's global state before extracting f0.
  ddsp.spectral_ops.reset_crepe()

  # Compute features.
  log("computing audio features...")
  start_time = time.time()
  audio_features = ddsp.training.metrics.compute_audio_features(audio)
  audio_features['loudness_db'] = audio_features['loudness_db'].astype(
      np.float32)
  audio_features_mod = None
  duration = time.time() - start_time
  log("done - {:.1f} s".format(duration))

  model_dir = ckpt_dir
  gin_file = os.path.join(model_dir, 'operative_config-0.gin')

  # Load the dataset statistics.
  dataset_stats = None
  dataset_stats_file = os.path.join(model_dir, 'dataset_statistics.pkl')
  log(f'Loading dataset statistics from {dataset_stats_file}')
  try:
    if tf.io.gfile.exists(dataset_stats_file):
      with tf.io.gfile.GFile(dataset_stats_file, 'rb') as f:
        dataset_stats = pickle.load(f)
  except Exception:
    traceback.print_exc(file=sys.stderr)
    log('Loading dataset statistics from pickle failed!')

  # Parse the gin config.
  with gin.unlock_config():
    gin.parse_config_file(gin_file, skip_unknown=True)

  # Assumes only one checkpoint in the folder, 'ckpt-[iter]'.
  ckpt_files = [f for f in tf.io.gfile.listdir(model_dir) if 'ckpt' in f]
  ckpt_name = ckpt_files[0].split('.')[0]
  ckpt = os.path.join(model_dir, ckpt_name)

  # Ensure dimensions and sampling rates match the training config.
  time_steps_train = gin.query_parameter('F0LoudnessPreprocessor.time_steps')
  n_samples_train = gin.query_parameter('Harmonic.n_samples')
  hop_size = int(n_samples_train / time_steps_train)
  time_steps = int(audio.shape[1] / hop_size)
  n_samples = time_steps * hop_size

  gin_params = [
      'Harmonic.n_samples = {}'.format(n_samples),
      'FilteredNoise.n_samples = {}'.format(n_samples),
      'F0LoudnessPreprocessor.time_steps = {}'.format(time_steps),
      'oscillator_bank.use_angular_cumsum = True',  # Avoids cumsum accumulation errors.
  ]
  with gin.unlock_config():
    gin.parse_config(gin_params)

  # Trim all input vectors to correct lengths.
  for key in ['f0_hz', 'f0_confidence', 'loudness_db']:
    audio_features[key] = audio_features[key][:time_steps]
  audio_features['audio'] = audio_features['audio'][:, :n_samples]

  # Set up the model just to predict audio given new conditioning.
  log("restoring model...")
  start_time = time.time()
  model = ddsp.training.models.Autoencoder()
  model.restore(ckpt)
  # Build the model by running a batch through it.
  _ = model(audio_features, training=False)
  duration = time.time() - start_time
  log("done - {:.1f} s".format(duration))

  # Modify conditioning.
  audio_features_mod = {k: v.copy() for k, v in audio_features.items()}

  mask_on = None
  if adjust and dataset_stats is not None:
    # Detect sections that are "on".
    mask_on, note_on_value = detect_notes(audio_features['loudness_db'],
                                          audio_features['f0_confidence'],
                                          f0_confidence_threshold)
    if np.any(mask_on):
      # Shift the pitch register toward the dataset's mean pitch.
      target_mean_pitch = dataset_stats['mean_pitch']
      pitch = ddsp.core.hz_to_midi(audio_features['f0_hz'])
      mean_pitch = np.mean(pitch[mask_on])
      p_diff = target_mean_pitch - mean_pitch
      p_diff_octave = p_diff / 12.0
      round_fn = np.floor if p_diff_octave > 1.5 else np.ceil
      p_diff_octave = round_fn(p_diff_octave)
      audio_features_mod = shift_f0(audio_features_mod, p_diff_octave)

      # Quantile shift the note_on parts.
      _, loudness_norm = colab_utils.fit_quantile_transform(
          audio_features['loudness_db'],
          mask_on,
          inv_quantile=dataset_stats['quantile_transform'])

      # Turn down the note_off parts.
      mask_off = np.logical_not(mask_on)
      loudness_norm[mask_off] -= quiet * (
          1.0 - note_on_value[mask_off][:, np.newaxis])
      loudness_norm = np.reshape(loudness_norm,
                                 audio_features['loudness_db'].shape)
      audio_features_mod['loudness_db'] = loudness_norm

      # Auto-tune.
      if autotune:
        f0_midi = np.array(
            ddsp.core.hz_to_midi(audio_features_mod['f0_hz']))
        tuning_factor = get_tuning_factor(
            f0_midi, audio_features_mod['f0_confidence'], mask_on)
        f0_midi_at = auto_tune(f0_midi, tuning_factor, mask_on,
                               amount=autotune)
        audio_features_mod['f0_hz'] = ddsp.core.midi_to_hz(f0_midi_at)
    # else:
    #   log('\nSkipping auto-adjust (no notes detected).')
  # else:
  #   log('\nSkipping auto-adjust (adjust disabled or no dataset statistics found).')

  # Manual shifts.
  audio_features_mod = shift_ld(audio_features_mod, loudness_db_shift)
  audio_features_mod = shift_f0(audio_features_mod, f0_octave_shift)
  audio_features_mod = mask_by_confidence(audio_features_mod,
                                          f0_confidence_threshold)

  # Resynthesize audio.
  af = audio_features if audio_features_mod is None else audio_features_mod

  # Run a batch of predictions.
  log("predicting...")
  start_time = time.time()
  outputs = model(af, training=False)
  audio_gen = model.get_audio_from_outputs(outputs)
  duration = time.time() - start_time
  log("done - {:.1f} s".format(duration))

  return audio_gen
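
# `mask_by_confidence` is called by `timbre_transfer` above but is not defined
# in this file. The sketch below is a plausible stand-in, not the original
# helper: it assumes the intent is to silence frames whose f0 estimate is
# unreliable by pinning their loudness to the bottom of DDSP's loudness range
# (LD_RANGE, 120 dB, from ddsp.spectral_ops).
def mask_by_confidence(audio_features, confidence_threshold=0.0):
  """Silence frames where f0 confidence is below threshold (assumed behavior)."""
  if confidence_threshold <= 0.0:
    return audio_features
  mask = audio_features['f0_confidence'] < confidence_threshold
  audio_features['loudness_db'] = np.array(audio_features['loudness_db'])
  audio_features['loudness_db'][mask] = -ddsp.spectral_ops.LD_RANGE
  return audio_features
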
## Helper functions, shared by `timbre_transfer` above and `transfer` below.
def shift_ld(audio_features, ld_shift=0.0):
  """Shift loudness by a number of decibels."""
  audio_features['loudness_db'] += ld_shift
  return audio_features


def shift_f0(audio_features, pitch_shift=0.0):
  """Shift f0 by a number of octaves."""
  audio_features['f0_hz'] *= 2.0 ** (pitch_shift)
  audio_features['f0_hz'] = np.clip(audio_features['f0_hz'],
                                    0.0,
                                    librosa.midi_to_hz(110.0))
  return audio_features


def transfer(audio, model_dir, sample_rate=DEFAULT_SAMPLE_RATE):
  audio = audio[np.newaxis, :]
  ddsp.spectral_ops.reset_crepe()
  audio_features = ddsp.training.metrics.compute_audio_features(audio)
  audio_features['loudness_db'] = audio_features['loudness_db'].astype(
      np.float32)
  audio_features_mod = None

  gin_file = os.path.join(model_dir, 'operative_config-0.gin')

  # Load the dataset statistics.
  dataset_stats = None
  dataset_stats_file = os.path.join(model_dir, 'dataset_statistics.pkl')
  try:
    if tf.io.gfile.exists(dataset_stats_file):
      with tf.io.gfile.GFile(dataset_stats_file, 'rb') as f:
        dataset_stats = pickle.load(f)
  except Exception as err:
    print('Loading dataset statistics from pickle failed: {}.'.format(err))

  # Parse the gin config.
  with gin.unlock_config():
    gin.parse_config_file(gin_file, skip_unknown=True)

  # Assumes only one checkpoint in the folder, 'ckpt-[iter]'.
  ckpt_files = [f for f in tf.io.gfile.listdir(model_dir) if 'ckpt' in f]
  ckpt_name = ckpt_files[0].split('.')[0]
  ckpt = os.path.join(model_dir, ckpt_name)

  # Ensure dimensions and sampling rates match the training config. Note
  # this entry point targets an older DDSP config (DefaultPreprocessor /
  # Additive) than `timbre_transfer` (F0LoudnessPreprocessor / Harmonic).
  time_steps_train = gin.query_parameter('DefaultPreprocessor.time_steps')
  n_samples_train = gin.query_parameter('Additive.n_samples')
  hop_size = int(n_samples_train / time_steps_train)
  time_steps = int(audio.shape[1] / hop_size)
  n_samples = time_steps * hop_size

  gin_params = [
      'Additive.n_samples = {}'.format(n_samples),
      'FilteredNoise.n_samples = {}'.format(n_samples),
      'DefaultPreprocessor.time_steps = {}'.format(time_steps),
      'oscillator_bank.use_angular_cumsum = True',  # Avoids cumsum accumulation errors.
  ]
  with gin.unlock_config():
    gin.parse_config(gin_params)

  # Trim all input vectors to correct lengths.
  for key in ['f0_hz', 'f0_confidence', 'loudness_db']:
    audio_features[key] = audio_features[key][:time_steps]
  audio_features['audio'] = audio_features['audio'][:, :n_samples]

  # Set up the model just to predict audio given new conditioning.
  model = ddsp.training.models.Autoencoder()
  model.restore(ckpt)
  # Build the model by running a batch through it.
  _ = model(audio_features, training=False)

  # Fixed adjustment settings (the notebook exposed these as UI controls).
  threshold = 1
  adjust = True
  quiet = 20
  autotune = 0
  pitch_shift = -1
  loudness_shift = 3

  audio_features_mod = {k: v.copy() for k, v in audio_features.items()}

  mask_on = None
  if adjust and dataset_stats is not None:
    # Detect sections that are "on".
    mask_on, note_on_value = detect_notes(audio_features['loudness_db'],
                                          audio_features['f0_confidence'],
                                          threshold)
    if np.any(mask_on):
      # Shift the pitch register toward the dataset's mean pitch.
      target_mean_pitch = dataset_stats['mean_pitch']
      pitch = ddsp.core.hz_to_midi(audio_features['f0_hz'])
      mean_pitch = np.mean(pitch[mask_on])
      p_diff = target_mean_pitch - mean_pitch
      p_diff_octave = p_diff / 12.0
      round_fn = np.floor if p_diff_octave > 1.5 else np.ceil
      p_diff_octave = round_fn(p_diff_octave)
      audio_features_mod = shift_f0(audio_features_mod, p_diff_octave)

      # Quantile shift the note_on parts.
      _, loudness_norm = colab_utils.fit_quantile_transform(
          audio_features['loudness_db'],
          mask_on,
          inv_quantile=dataset_stats['quantile_transform'])

      # Turn down the note_off parts.
      mask_off = np.logical_not(mask_on)
      loudness_norm[mask_off] -= quiet * (
          1.0 - note_on_value[mask_off][:, np.newaxis])
      loudness_norm = np.reshape(loudness_norm,
                                 audio_features['loudness_db'].shape)
      audio_features_mod['loudness_db'] = loudness_norm

      # Auto-tune.
      if autotune:
        f0_midi = np.array(ddsp.core.hz_to_midi(audio_features_mod['f0_hz']))
        tuning_factor = get_tuning_factor(
            f0_midi, audio_features_mod['f0_confidence'], mask_on)
        f0_midi_at = auto_tune(f0_midi, tuning_factor, mask_on,
                               amount=autotune)
        audio_features_mod['f0_hz'] = ddsp.core.midi_to_hz(f0_midi_at)
    else:
      print('\nSkipping auto-adjust (no notes detected).')
  else:
    print('\nSkipping auto-adjust (adjust disabled or no dataset statistics found).')

  # Manual shifts.
  audio_features_mod = shift_ld(audio_features_mod, loudness_shift)
  audio_features_mod = shift_f0(audio_features_mod, pitch_shift)

  af = audio_features if audio_features_mod is None else audio_features_mod
  return model(af, training=False)
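
# Minimal usage sketch. The file paths and checkpoint directory below are
# hypothetical, and the 16 kHz output rate is an assumption (DDSP's pretrained
# models use 16 kHz audio; DEFAULT_SAMPLE_RATE in ddsp.colab.colab_utils is
# 16000). `soundfile` is an extra dependency used only for writing the result.
if __name__ == '__main__':
  import soundfile as sf

  input_wav = 'input.wav'           # hypothetical input recording
  ckpt_dir = 'pretrained/violin'    # hypothetical checkpoint directory

  # Load at the source rate; timbre_transfer resamples to the model rate.
  source_audio, source_sr = librosa.load(input_wav, sr=None, mono=True)

  audio_gen = timbre_transfer(
      ckpt_dir, source_audio,
      in_sample_rate=source_sr,
      out_sample_rate=DEFAULT_SAMPLE_RATE,
      f0_octave_shift=0,
      f0_confidence_threshold=0.7,
      loudness_db_shift=0,
      adjust=True,
      quiet=20,
      autotune=0)

  # audio_gen is a [1, n_samples] tensor; drop the batch dim before writing.
  sf.write('output.wav', np.asarray(audio_gen)[0], DEFAULT_SAMPLE_RATE)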