def convert_audio_to_stft(src_dir, dest_dir, extension):
    """
    Compute the Short-Time Fourier Transform magnitude of each audio clip
    returned by prep_utils.get_unprocessed_items and store it with np.save.

    :param src_dir: input audio directory
    :param dest_dir: output STFT directory; it is concatenated directly with
        the file name, so it has to end with a path separator
    :param extension: suffix appended to every output file name (np.save adds
        ".npy" on its own when the resulting name does not already end in it)
    """
    pending = prep_utils.get_unprocessed_items(src_dir=src_dir, dest_dir=dest_dir)
    started_at = time.time()

    for audio_path in pending:
        prep_utils.display_progress_eta(current_item=audio_path,
                                        total_items=pending,
                                        start_time=started_at)

        # librosa.load also returns the sample rate, which is not needed here.
        samples, _ = librosa.load(audio_path)

        # Complex STFT of the clip.
        spectrum = librosa.stft(samples)

        # Split into magnitude and phase; only the magnitude is persisted.
        magnitude, _ = librosa.magphase(spectrum)

        stem = prep_utils.get_filename(audio_path)
        np.save(dest_dir + stem + extension, magnitude)
def make_audio_chunks(seconds, dest_dir):
    """
    Split every ".wav" file found under DATASET_DIR into clips of a fixed
    length and export each clip as its own wav file.

    Only clips of the full requested length are exported: a trailing clip
    that is shorter than ``seconds`` is discarded. Files that yield no chunks
    at all are skipped instead of raising.

    :param seconds: desired clip length, in seconds
    :param dest_dir: output directory; it is concatenated directly with the
        clip name, so it has to end with a path separator
    """
    paths = prep_utils.get_absolute_file_paths(DATASET_DIR, ".wav")
    start_time = time.time()

    # Loop-invariant, so computed once. The chunking helper works in
    # milliseconds.
    chunk_length_ms = seconds * 1000

    for audio_path in paths:
        prep_utils.display_progress_eta(current_item=audio_path,
                                        total_items=paths,
                                        start_time=start_time)

        audio = AudioSegment.from_file(audio_path)
        chunks = make_chunks(audio, chunk_length_ms)

        # Only the last chunk can be a short remainder. Drop it when it really
        # is shorter than requested; an unconditional pop would raise on an
        # empty list and would also throw away a valid full-length final chunk
        # when the audio is an exact multiple of the clip length.
        # NOTE(review): assumes len(chunk) is the duration in milliseconds, as
        # with pydub's AudioSegment - confirm.
        if chunks and len(chunks[-1]) < chunk_length_ms:
            chunks.pop()

        # "<source name without extension>_chunk_<index>.wav"
        base_name = os.path.basename(os.path.splitext(audio_path)[0])

        # Export all of the individual chunks as wav files
        for i, chunk in enumerate(chunks):
            chunk_name = base_name + "_chunk_{0}.wav".format(i)
            chunk.export(dest_dir + chunk_name, format="wav")

    print("\n\nChunks export completed.")
def convert_stft_to_images_grayscale(src_dir, dest_dir, ext=".png", size=None):
    """
    Write one image per stored STFT matrix, after passing the matrix through
    prep_utils.increase_brightness.

    :param src_dir: source directory that stores STFT matrices
    :param dest_dir: destination for the images; it is concatenated directly
        with the file name, so it has to end with a path separator
    :param ext: image extension
    :param size: side length of the square output image; a falsy value
        (None, 0) leaves the matrix at its own dimensions
    """
    pending = prep_utils.get_unprocessed_items(src_dir=src_dir, dest_dir=dest_dir)
    began = time.time()

    for stft_path in pending:
        prep_utils.display_progress_eta(current_item=stft_path,
                                        total_items=pending,
                                        start_time=began)

        image = prep_utils.increase_brightness(np.load(stft_path))

        if size:
            image = cv2.resize(image, (size, size),
                               interpolation=cv2.INTER_CUBIC)

        target = dest_dir + prep_utils.get_filename(stft_path) + ext
        cv2.imwrite(target, image)
def convert_stft_to_images(src_dir, dest_dir, ext=".png", size=None):
    """
    Render each stored STFT matrix as an image file: the matrix is passed
    through normalize_stft, optionally resized to a square, saved with
    matplotlib and then re-encoded in place with OpenCV.

    :param src_dir: source folder where STFT matrices are stored
    :param dest_dir: output images folder; it is concatenated directly with
        the file name, so it has to end with a path separator
    :param ext: image format, defaulted to .png
    :param size: side length of the desired square image; a falsy value
        (None, 0) keeps the matrix dimensions
    """
    pending = prep_utils.get_unprocessed_items(src_dir=src_dir, dest_dir=dest_dir)
    began = time.time()

    for stft_path in pending:
        prep_utils.display_progress_eta(current_item=stft_path,
                                        total_items=pending,
                                        start_time=began)

        picture = normalize_stft(np.load(stft_path))

        if size:
            picture = cv2.resize(picture, (size, size),
                                 interpolation=cv2.INTER_CUBIC)

        target = dest_dir + prep_utils.get_filename(stft_path) + ext

        # matplotlib applies its colormap while saving.
        plt.imsave(target, picture)

        # Read the file back and overwrite it through OpenCV.
        # NOTE(review): presumably this strips the alpha channel matplotlib
        # writes, leaving a 3-channel image - confirm.
        reloaded = cv2.imread(target)
        cv2.imwrite(target, reloaded)
def audio_reconstruction_test(src_dir, dest_dir, ext=".png", size=None):
    """
    Interactive experiment for the matrix -> image -> audio round trip.

    Only the first item returned by prep_utils.get_unprocessed_items is
    handled (the loop breaks after one pass). Each intermediate matrix is
    dumped to a CSV and/or image under fixed names in ``dest_dir``, and the
    two preview windows block until a key is pressed.

    :param src_dir: directory holding the stored matrices that are loaded
        with np.load
    :param dest_dir: directory for the CSV, image and wav artefacts; it is
        concatenated directly with the file names, so it has to end with a
        path separator
    :param ext: image type
    :param size: side length for the optional square resize; a falsy value
        skips the resize
    """

    def _dump_csv(matrix, file_name):
        # Header-less, index-less CSV so the file holds nothing but the values.
        pd.DataFrame(matrix).to_csv(dest_dir + file_name, header=None, index=False)

    def _preview(matrix):
        # Blocks until any key is pressed in the OpenCV window.
        cv2.imshow("image", matrix)
        cv2.waitKey(0)

    pending = prep_utils.get_unprocessed_items(src_dir=src_dir, dest_dir=dest_dir)
    began = time.time()

    for item_path in pending:
        prep_utils.display_progress_eta(current_item=item_path,
                                        total_items=pending,
                                        start_time=began)

        # Stage 1: the raw stored matrix.
        raw = np.load(item_path)
        _preview(raw)
        _dump_csv(raw, "S.csv")

        # Stage 2: brightened matrix, saved as an image.
        brightened = prep_utils.increase_brightness(raw)
        _preview(brightened)
        _dump_csv(brightened, "S_scaled.csv")

        gray_file = dest_dir + "gray" + ext
        cv2.imwrite(gray_file, brightened)

        # Stage 3: read the image back as single-channel float32.
        recovered = np.array(cv2.imread(gray_file, cv2.IMREAD_GRAYSCALE),
                             dtype=np.float32)
        if size:
            recovered = cv2.resize(recovered, (size, size),
                                   interpolation=cv2.INTER_CUBIC)

        _dump_csv(recovered, "S_recovered.csv")
        cv2.imwrite(dest_dir + "resized" + ext, recovered)

        # Stage 4: reload from the CSV and stretch to 431 columns x 1025 rows
        # (cv2.resize takes (width, height)).
        # NOTE(review): 1025 x 431 presumably matches the original STFT
        # shape - confirm.
        from_csv = np.genfromtxt(dest_dir + "S_recovered.csv", delimiter=',')
        from_csv = np.array(from_csv, dtype=np.float32)
        from_csv = cv2.resize(from_csv, (431, 1025),
                              interpolation=cv2.INTER_CUBIC)

        waveform = librosa.griffinlim(from_csv)

        # Save reconstructed data
        scipy.io.wavfile.write(dest_dir + "s.wav", 22050, waveform)

        # Single-sample experiment: stop after the first item.
        break
def _griffinlim_to_wav(magnitude, wav_path, denoised_path):
    """
    Invert a magnitude spectrogram with Griffin-Lim and write both the raw and
    the noise-reduced waveform.

    :param magnitude: magnitude spectrogram handed to librosa.griffinlim
    :param wav_path: output path of the raw reconstruction
    :param denoised_path: output path of the noise-reduced reconstruction
    """
    y = librosa.griffinlim(magnitude)
    scipy.io.wavfile.write(wav_path, 22050, y)

    # The wav is read back so that noise reduction works on the stored data.
    rate, data = scipy.io.wavfile.read(wav_path)

    # The clip itself is passed as the noise profile.
    reduced_noise = nr.reduce_noise(audio_clip=data, noise_clip=data, verbose=False)
    sf.write(denoised_path, reduced_noise, rate)


def audio_reconstruction_stylegan(src_dir, dest_dir, resize_h, resize_w, mode="RGB"):
    """
    Image to Audio reconstruction post StyleGAN image generation.

    For every image, the intermediate matrices are written as CSV and PNG
    next to the reconstructed wav and its noise-reduced version. File names
    are "<image name><suffix>" inside ``dest_dir + sub_dir``, where sub_dir
    comes from ar_utils.select_images_iteration.

    :param src_dir: directory of fake images generated by StyleGAN
    :param dest_dir: destination directory where converted audio will be
        saved; it is concatenated directly with the selected sub directory
    :param resize_h: height of the desired image dimension
    :param resize_w: width of the desired image dimension
    :param mode: "RGB" or "grayscale", generated image type by StyleGAN; with
        any other value no image is converted
    """
    src_dir, sub_dir = ar_utils.select_images_iteration(directory=src_dir)
    paths = prep_utils.get_absolute_file_paths(src_dir)

    out_dir = dest_dir + sub_dir
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_dir, exist_ok=True)

    start_time = time.time()
    for path in paths:
        prep_utils.display_progress_eta(current_item=path,
                                        total_items=paths,
                                        start_time=start_time)

        # Common prefix of every artefact written for this image.
        out_path = out_dir + prep_utils.get_filename(path)

        if mode == "RGB":
            image = cv2.imread(path)
            image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            S_recovered = np.array(image_gray, dtype=np.float32)
            # cv2.resize takes (width, height).
            S_recovered = cv2.resize(S_recovered, (resize_w, resize_h),
                                     interpolation=cv2.INTER_CUBIC)

            # Min-max scale to [-1, 1]. A flat image has zero range, which
            # would divide 0 by 0 and fill the matrix with NaN; treat the
            # range as 1 so every value maps to the lower bound (-1).
            lowest = np.min(S_recovered)
            span = np.max(S_recovered) - lowest
            if span == 0:
                span = np.float32(1.0)
            S = (S_recovered - lowest) / span * 2 - 1

            pd.DataFrame(S).to_csv(out_path + "_norm.csv", header=None, index=False)
            plt.imsave(out_path + "_norm.png", S)

            S = ar_utils.unnormalize_stft(s=S)
            pd.DataFrame(S).to_csv(out_path + "_reconstruct.csv", header=None, index=False)
            plt.imsave(out_path + "_reconstruct.png", S)

            _griffinlim_to_wav(S, out_path + ".wav", out_path + "_nr.wav")

        elif mode == "grayscale":
            S = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
            S_recovered = np.array(S, dtype=np.float32)

            pd.DataFrame(S_recovered).to_csv(out_path + "_original.csv", header=None, index=False)
            cv2.imwrite(out_path + "_original.png", S_recovered)

            # cv2.resize takes (width, height).
            S_recovered = cv2.resize(S_recovered, (resize_w, resize_h),
                                     interpolation=cv2.INTER_CUBIC)
            S_recovered = ar_utils.decrease_brightness(S_recovered)

            pd.DataFrame(S_recovered).to_csv(out_path + "_recovered.csv", header=None, index=False)
            cv2.imwrite(out_path + "_recovered.png", S_recovered)

            _griffinlim_to_wav(S_recovered,
                               out_path + "_recovered.wav",
                               out_path + "_nr.wav")