def display_spectrogram():
    """Generate and display sample log-mel spectrograms from audio files.

    Loads the first three 20-second audio chunks, computes the STFT
    magnitude, projects the log-magnitude onto a mel basis, and plots
    each result.
    """
    paths = prep_utils.get_absolute_file_paths(AUDIO_CHUNKS_20S_DIR)[:3]
    for path in paths:
        y, sr = librosa.load(path)

        # Short-time Fourier transform underlies most analysis.
        # librosa.stft returns a complex matrix D where D[f, t] is the
        # FFT value at frequency bin f, time (frame) t.
        D = librosa.stft(y)

        # Separate the magnitude and phase and only use magnitude.
        S, phase = librosa.magphase(D)
        print("S Shape: ", S.shape)

        # Clamp zero bins before taking the log: log(0) = -inf would
        # propagate through the mel projection and corrupt the display.
        melspec_log = librosa.feature.melspectrogram(
            S=np.log(np.maximum(S, np.finfo(np.float32).tiny)), sr=sr)
        print("MelSpec Shape: ", melspec_log.shape)

        plt.figure()
        librosa.display.specshow(melspec_log, y_axis='mel', x_axis='time')
        plt.colorbar()
        plt.show()
def make_audio_chunks(seconds, dest_dir):
    """Split each dataset .wav into fixed-length clips and export them.

    :param seconds: desired clip length in seconds
    :param dest_dir: output directory (expected to end with a path separator)
    """
    paths = prep_utils.get_absolute_file_paths(DATASET_DIR, ".wav")
    start_time = time.time()

    for audio_path in paths:
        prep_utils.display_progress_eta(current_item=audio_path,
                                        total_items=paths,
                                        start_time=start_time)

        audio = AudioSegment.from_file(audio_path)
        chunk_length_ms = seconds * 1000
        chunks = make_chunks(audio, chunk_length_ms)

        # Drop only a trailing *partial* chunk. The previous unconditional
        # pop(-1) raised IndexError on audio shorter than one chunk and
        # discarded the final chunk even when it was exactly full length.
        if chunks and len(chunks[-1]) < chunk_length_ms:
            chunks.pop()

        # Export all of the individual chunks as wav files.
        for i, chunk in enumerate(chunks):
            _, chunk_name = os.path.split(
                os.path.splitext(audio_path)[0] + "_chunk_{0}.wav".format(i))
            chunk.export(dest_dir + chunk_name, format="wav")

    print("\n\nChunks export completed.")
def downsample():
    """Resize every processed STFT matrix to 256x256 and save it locally."""
    for src_path in prep_utils.get_absolute_file_paths(PROCESSED_STFT_DIR):
        matrix = np.load(src_path)
        # Anti-aliased resize to the fixed 256x256 target resolution.
        shrunk = skimage.transform.resize(matrix, (256, 256),
                                          anti_aliasing=True)
        target = RESIZED_STFT_DIR + prep_utils.get_filename(src_path) + ".npy"
        np.save(target, shrunk)
def audio_reconstruction():
    """Reconstruct sample audio clips from saved STFT matrices.

    Applies Griffin-Lim phase recovery to each stored magnitude matrix
    and writes the resulting waveform to AUDIO_OUT_DIR as a .wav file.
    """
    for stft_path in prep_utils.get_absolute_file_paths(STFT_ARRAY_DIR):
        magnitude = np.load(stft_path)
        waveform = librosa.griffinlim(magnitude)
        wav_out = AUDIO_OUT_DIR + prep_utils.get_filename(stft_path) + ".wav"
        # Save reconstructed data at a fixed 22050 Hz sample rate.
        scipy.io.wavfile.write(wav_out, 22050, waveform)
def record_mean_std(out_csv="./data/saved_mean_std.csv"):
    """Record the mean and std of each log-magnitude STFT matrix.

    Writes one CSV row per matrix with its mean, std, and source path.

    :param out_csv: destination CSV path (default preserves the original
        hard-coded location)
    """
    paths = prep_utils.get_absolute_file_paths(STFT_ARRAY_DIR)
    mean_list = []
    std_list = []
    for path in paths:
        S = np.load(path)
        # Clamp zeros before the log so a single silent bin cannot turn
        # the whole matrix's mean/std into -inf / nan.
        S = np.log(np.maximum(S, np.finfo(np.float32).tiny))
        mean_list.append(np.mean(S))
        std_list.append(np.std(S))
        print("Finished:", path)

    data = {"mean": mean_list, "std": std_list, "path": paths}
    df = pd.DataFrame.from_dict(data)
    df.to_csv(out_csv)
def _denoise_wav(wav_path, out_path):
    """Apply spectral-gating noise reduction to a wav file and save it."""
    rate, data = scipy.io.wavfile.read(wav_path)
    reduced_noise = nr.reduce_noise(audio_clip=data,
                                    noise_clip=data,
                                    verbose=False)
    sf.write(out_path, reduced_noise, rate)


def _reconstruct_rgb(path, out_prefix, resize_h, resize_w):
    """Convert one RGB StyleGAN image back to audio; return the wav path."""
    image = cv2.imread(path)
    image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    S_recovered = np.array(image_gray, dtype=np.float32)
    S_recovered = cv2.resize(S_recovered, (resize_w, resize_h),
                             interpolation=cv2.INTER_CUBIC)

    # Min-max normalize the grayscale image into [-1, 1].
    S = (S_recovered - np.min(S_recovered)) / (
        np.max(S_recovered) - np.min(S_recovered)) * 2 - 1
    pd.DataFrame(S).to_csv(out_prefix + "_norm.csv",
                           header=None, index=False)
    plt.imsave(out_prefix + "_norm.png", S)

    # Undo the dataset normalization to recover STFT-scale values.
    S = ar_utils.unnormalize_stft(s=S)
    pd.DataFrame(S).to_csv(out_prefix + "_reconstruct.csv",
                           header=None, index=False)
    plt.imsave(out_prefix + "_reconstruct.png", S)

    y = librosa.griffinlim(S)
    wav_path = out_prefix + ".wav"
    scipy.io.wavfile.write(wav_path, 22050, y)
    return wav_path


def _reconstruct_grayscale(path, out_prefix, resize_h, resize_w):
    """Convert one grayscale StyleGAN image back to audio; return the wav path."""
    S = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    S_recovered = np.array(S, dtype=np.float32)
    pd.DataFrame(S_recovered).to_csv(out_prefix + "_original.csv",
                                     header=None, index=False)
    cv2.imwrite(out_prefix + "_original.png", S_recovered)

    S_recovered = cv2.resize(S_recovered, (resize_w, resize_h),
                             interpolation=cv2.INTER_CUBIC)
    S_recovered = ar_utils.decrease_brightness(S_recovered)
    pd.DataFrame(S_recovered).to_csv(out_prefix + "_recovered.csv",
                                     header=None, index=False)
    cv2.imwrite(out_prefix + "_recovered.png", S_recovered)

    y = librosa.griffinlim(S_recovered)
    wav_path = out_prefix + "_recovered.wav"
    scipy.io.wavfile.write(wav_path, 22050, y)
    return wav_path


def audio_reconstruction_stylegan(src_dir, dest_dir, resize_h, resize_w,
                                  mode="RGB"):
    """Image to Audio reconstruction post StyleGAN image generation.

    :param src_dir: directory of fake images generated by StyleGAN
    :param dest_dir: destination directory where converted audio will be saved
    :param resize_h: height of the desired image dimension
    :param resize_w: width of the desired image dimension
    :param mode: "RGB" or "grayscale", generated image type by StyleGAN
    :raises ValueError: if mode is neither "RGB" nor "grayscale"
        (previously an unknown mode silently produced no output)
    """
    src_dir, sub_dir = ar_utils.select_images_iteration(directory=src_dir)
    paths = prep_utils.get_absolute_file_paths(src_dir)

    out_dir = dest_dir + sub_dir
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    start_time = time.time()
    for path in paths:
        prep_utils.display_progress_eta(current_item=path,
                                        total_items=paths,
                                        start_time=start_time)
        out_prefix = out_dir + prep_utils.get_filename(path)

        if mode == "RGB":
            wav_path = _reconstruct_rgb(path, out_prefix, resize_h, resize_w)
        elif mode == "grayscale":
            wav_path = _reconstruct_grayscale(path, out_prefix,
                                              resize_h, resize_w)
        else:
            raise ValueError(
                "mode must be 'RGB' or 'grayscale', got {!r}".format(mode))

        # Shared noise-reduction pass (previously duplicated in both
        # branches verbatim).
        _denoise_wav(wav_path, out_prefix + "_nr.wav")