def main(_):
    """Stylize each configured sample and save the resulting images and audio.

    For every entry of the hard-coded sample list this reads
    ``sample/<name>/style.mp3`` and ``sample/<name>/content.mp3``, converts
    both to spectrograms, runs ``stylize`` on them, and writes the raw and the
    post-processed result next to the inputs as ``stylized_raw.png`` /
    ``stylized_raw.mp3`` and ``stylized.png`` / ``stylized.mp3``.

    Args:
        _: Ignored; never referenced in the body.
    """
    # REVIEW josephz: This paradigm was copied from inference-hack.py
    # initialize_globals()
    fft_size = 1536
    phase_steps = 15
    root = "sample"
    # Other sample sets that have been tried here:
    #   ["new_test"], ["perfect_features"], ["rolling_in_the_one_more_time"]
    names = ["rolling_in_the_deep"]

    post_processor = PostProcessor()
    post_processor.load_weights("weights.h5")

    for sample_name in names:
        timer_label = "total processing for " + sample_name
        console.h1("Processing %s" % sample_name)
        console.time(timer_label)

        folder = root + "/" + sample_name
        style_path = folder + "/style.mp3"
        content_path = folder + "/content.mp3"
        stylized_audio_path = folder + "/stylized.mp3"

        # Read style audio to spectrograms.
        style_audio, _style_rate = conversion.file_to_audio(style_path)
        style_img, style_phase = conversion.audio_to_spectrogram(
            style_audio, fft_window_size=fft_size)

        # Read content audio to spectrograms.
        content_audio, _content_rate = conversion.file_to_audio(content_path)
        content_img, content_phase = conversion.audio_to_spectrogram(
            content_audio, fft_window_size=fft_size)

        stylized_img_raw, stylized_img = stylize(
            content_img, style_img, content_phase, style_phase, content_path,
            style_path, post_processor)

        # Raw result first, post-processed result second; both are rebuilt
        # into audio using the content track's phase.
        outputs = (
            (stylized_img_raw, folder + "/stylized_raw.png",
             folder + "/stylized_raw.mp3"),
            (stylized_img, folder + "/stylized.png", stylized_audio_path),
        )
        for spectrogram, image_path, audio_path in outputs:
            audio = conversion.amplitude_to_audio(
                spectrogram,
                fft_window_size=fft_size,
                phase_iterations=phase_steps,
                phase=content_phase)
            conversion.image_to_file(spectrogram, image_path)
            conversion.audio_to_file(audio, audio_path)
        # np.save("stylized_img.npy", stylized_img)
        # np.save("content_phase.npy", content_phase)

        console.timeEnd(timer_label)
        console.info("Finished processing %s; saved to %s" %
                     (sample_name, stylized_audio_path))
def extract_fundamental(amplitude):
    """Keep only a weighted band around the strongest low bin of each frame.

    For every index ``t`` along axis 1, the candidate bin is the argmax over
    the first 50 rows of ``amplitude`` after averaging over axis 2. If the bin
    at half that index carries more than 40% of the candidate's mean
    amplitude, the lower bin is used instead (handles the case where the 2nd
    harmonic is louder than the first). Candidates at bin 5 or below are
    skipped. Otherwise the rows from 4 below to 7 above the candidate are
    copied into the output, scaled by weights that rise 0 -> 0.75 below the
    candidate and fall 1 -> 0.125 from the candidate upwards.

    Finally a grey erosion followed by a grey dilation of the result is
    thresholded at 0.1 and used as a mask to drop isolated specks.

    Args:
        amplitude: Array with at least three axes; axis 0 is indexed as the
            frequency bin and axis 1 as the time step. Axis 2 is averaged
            over (presumably audio channels -- confirm against the caller).

    Returns:
        A new float array of ``amplitude.shape`` that is zero everywhere
        except inside the retained bands.
    """
    # TODO: replace all of this with real code or at least clean it up
    # it should just be one big numpy thingy
    band_below = -4  # offset of the first kept row relative to the peak
    band_above = 8  # exclusive offset of the last kept row
    rising = np.arange(band_below, 0) / band_below
    falling = np.arange(band_above) / band_above
    # Column vector so it broadcasts across axis 2 of each band slice.
    band_weights = (1 - np.concatenate((rising, falling)))[:, np.newaxis]

    fundamental = np.zeros(amplitude.shape)

    console.time("big loop")
    peaks = np.argmax(np.mean(amplitude[:50], axis=2), axis=0)
    # console.stats(peaks)
    for t, f in enumerate(peaks):
        # handle case where 2nd harmonic > first
        if np.mean(amplitude[f // 2, t]) > 0.4 * np.mean(amplitude[f, t]):
            f = f // 2
        if f > 5:
            lo = f + band_below
            band = amplitude[lo:f + band_above, t]
            rows = band.shape[0]
            # Near the top of a short spectrogram the slice has fewer rows
            # than the weight column; trim the weights to match instead of
            # failing on a shape mismatch.
            fundamental[lo:lo + rows, t] = band_weights[:rows] * band
    console.timeEnd("big loop")

    console.time("remove dots")
    eroded = grey_erosion(fundamental, structure=np.ones((3, 5, 1)))
    mask = grey_dilation(eroded, structure=np.ones((6, 12, 1))) > 0.1
    console.timeEnd("remove dots")

    fundamental *= mask
    return fundamental
def stylize(content, style, content_phase, style_phase, content_path,
            style_path, post_processor):
    """Transfer the style spectrogram onto the content spectrogram.

    Steps, in order: extract the fundamental masks and their
    frequencies/amplitudes for both inputs; pitch-normalize both inputs and
    write each to ``<path>.normalized.mp3``; load neural features for both
    files; build harmonic and sibilant maps for the content; run
    ``audio_patch_match``; post-process the match with ``post_processor`` and
    ``global_eq_match``.

    Args:
        content: Content amplitude spectrogram (indexed on axis 1 for its
            width and averaged over axis 2 below).
        style: Style amplitude spectrogram, used the same way.
        content_phase: Phase passed to ``normalize_pitch`` with ``content``.
        style_phase: Phase passed to ``normalize_pitch`` with ``style``.
        content_path: Path of the content audio; handed to
            ``get_feature_array`` and used as the prefix of the normalized
            output file.
        style_path: Same as ``content_path`` for the style audio.
        post_processor: Object providing
            ``predict_unstacked(amplitude=, harmonics=, sibilants=)``.

    Returns:
        ``(stylized, stylized_post_processed)``: the patch-match output and
        its post-processed, EQ-matched counterpart.
    """
    fft_size = 1536

    # Pitch fundamental extraction
    console.time("extracting fundamentals")
    content_mask = extract_fundamental(content)
    style_mask = extract_fundamental(style)
    console.timeEnd("extracting fundamentals")

    console.time("fundamental freqs and amps")
    content_freqs, content_amps = extract_fundamental_freqs_amps(
        content_mask, content)
    style_freqs, style_amps = extract_fundamental_freqs_amps(style_mask, style)
    console.timeEnd("fundamental freqs and amps")

    # Pitch normalization: both tracks are normalized first, then each one is
    # rendered to audio and written beside its source file.
    console.time("pitch normalization")
    content_norm, content_norm_phase = normalize_pitch(
        content, content_phase, content_freqs, content_amps, base_pitch=32)
    style_norm, style_norm_phase = normalize_pitch(
        style, style_phase, style_freqs, style_amps, base_pitch=32)
    normalized_tracks = (
        (content_norm, content_norm_phase, content_path),
        (style_norm, style_norm_phase, style_path),
    )
    for norm_amplitude, norm_phase, source_path in normalized_tracks:
        normalized_path = source_path + ".normalized.mp3"
        normalized_audio = conversion.amplitude_to_audio(
            norm_amplitude,
            fft_window_size=fft_size,
            phase_iterations=1,
            phase=norm_phase)
        conversion.audio_to_file(normalized_audio, normalized_path)
    console.timeEnd("pitch normalization")

    # Featurization
    use_spectral_features = False
    if use_spectral_features:
        content_features = compute_features(content)
        style_features = compute_features(style)
    else:
        # neural features
        def neural_features(audio_path, num_frames):
            """Load features, scale by their maximum, resize to num_frames."""
            features = get_feature_array(audio_path)
            features /= features.max()
            # console.stats(features, "features")
            # conversion.image_to_file(features[:,:,np.newaxis], "<name>_features.png")
            return resize(features, (features.shape[0], num_frames))

        content_features = neural_features(content_path, content.shape[1])
        style_features = neural_features(style_path, style.shape[1])

    # Harmonic recovery
    content_harmonics = fundamental_to_harmonics(content_freqs, content_amps,
                                                 content)
    content_harmonics = grey_dilation(content_harmonics, size=3)
    content_harmonics *= content.max() / content_harmonics.max()

    # Sibilant recovery
    content_sibilants = get_sibilants(content, content_amps)
    content_sibilants *= content.max() / content_sibilants.max()

    # Patchmatch
    console.time("patch match")
    use_patch_rescale = False  # alternative matcher, currently disabled
    if use_patch_rescale:
        stylized = audio_patch_rescale(
            content,
            style,
            content_freqs,
            style_freqs,
            content_features,
            style_features,
            content_harmonics,
            content_sibilants,
        )
    else:
        stylized = audio_patch_match(
            content,
            style,
            content_freqs,
            style_freqs,
            content_features,
            style_features,
            iterations=96)
    console.timeEnd("patch match")
    console.log("normal stylized has shape", stylized.shape)
    # ipdb.set_trace()

    post_processed = post_processor.predict_unstacked(
        amplitude=np.mean(stylized, axis=2),
        harmonics=np.mean(content_harmonics, axis=2),
        sibilants=np.mean(content_sibilants, axis=2))
    # TODO: actually run the network on both channels instead of doing this
    post_processed = np.dstack([post_processed, post_processed])
    post_processed = global_eq_match(post_processed, style)
    return stylized, post_processed
import sst
import ipdb

# Audio files to run the fundamental/harmonic extraction over.
test_files = [
    "../data/aligned/one_last_time/one_last_time_cover_aligned_30s.mp3",
    "../data/aligned/one_last_time/one_last_time_original_30s.mp3"
]
#test_files = ["sample/rolling_in_the_deep/style.mp3"]
for f in test_files:
    # Load the file and convert it to an amplitude/phase spectrogram.
    console.time("preprocessing")
    console.log("starting", f)
    audio, sample_rate = conversion.file_to_audio(f)
    amplitude, phase = conversion.audio_to_spectrogram(audio,
                                                       fft_window_size=1536)
    console.timeEnd("preprocessing")
    # Extract the fundamental mask and save it as an image next to the input.
    console.time("extracting fundamental")
    fundamental_mask = sst.extract_fundamental(amplitude)
    console.timeEnd("extracting fundamental")
    conversion.image_to_file(fundamental_mask, f + ".fundamental.png")
    # Rebuild harmonics from the fundamental and save them as an image too.
    console.time("fundamental to harmonics")
    fundamental_freqs, fundamental_amps = sst.extract_fundamental_freqs_amps(
        fundamental_mask, amplitude)
    harmonics = sst.fundamental_to_harmonics(fundamental_freqs,
                                             fundamental_amps, amplitude)
    console.timeEnd("fundamental to harmonics")
    conversion.image_to_file(harmonics, f + ".harmonics.png")
    # pitch normalization haha
    # NOTE(review): the body of this always-true branch is not visible here.
    if True:
np.random.shuffle(self.pairs) console.log("Loaded", len(self.pairs), "pairs") console.log("Shape of first pair [", self.pairs[0][1], "] is", self.pairs[0][0].shape, self.pairs[0][2].shape) def on_epoch_end(self): np.random.shuffle(self.pairs) def __len__(self): return 64 def __getitem__(self, index): max_index = int(np.floor(len(self.pairs) / self.batch_size)) index %= max_index x = [] y = [] style = [] for b in range(self.batch_size): x_i, file_name, y_i, f_min, f_max = self.pairs[index * self.batch_size + b] x.append(x_i) y.append(y_i) style.append(self.style_inputs[file_name][:, f_min:f_max]) return [np.array(x), np.array(style)], np.array(y) if __name__ == "__main__": console.time("loading all data") d = DataGenerator() console.timeEnd("loading all data")
# a test of what we could get if we perfectly matched each element of style test_content_file = "sample/rolling_in_the_deep/content.mp3" test_style_file = "sample/rolling_in_the_deep/reference_stylized.mp3" # Load them both as spectrograms console.time("preprocessing") content_audio, content_sample_rate = conversion.file_to_audio( test_content_file) content_amplitude, content_phase = conversion.audio_to_spectrogram( content_audio, fft_window_size=1536) style_audio, style_sample_rate = conversion.file_to_audio(test_style_file) style_amplitude, style_phase = conversion.audio_to_spectrogram( style_audio, fft_window_size=1536) console.timeEnd("preprocessing") stylized_amplitude = np.zeros(content_amplitude.shape) num_freq, num_timesteps, _ = content_amplitude.shape num_timesteps = min(num_timesteps, style_amplitude.shape[1]) # Preprocessing - compute fundamentals and harmonics console.time("super resolution") content_fundamental_mask = sst.extract_fundamental(content_amplitude) content_fundamental_freqs, content_fundamental_amps = sst.extract_fundamental_freqs_amps( content_fundamental_mask, content_amplitude) content_sibilants = sst.get_sibilants(content_amplitude, content_fundamental_amps) conversion.image_to_file(content_sibilants, test_content_file + ".sibilants.jpg")