def read_audio(fp, downsample=True):
    sig, sr = torchaudio.load(fp)
    if downsample:
        # 48 kHz -> 16 kHz
        if sig.size(0) % 3 == 0:
            sig = sig[::3].contiguous()
        else:
            sig = sig[:-(sig.size(0) % 3):3].contiguous()
    return sig, sr
def load_audio(path):
    sound, _ = torchaudio.load(path)
    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)  # multiple channels, average
    return sound
def load_audio(path):
    '''Read an audio file with torchaudio.

    Args:
        path (string): path to the audio file
    Returns:
        sound (numpy.ndarray): mono audio data; multi-channel input is
            averaged across channels (Samples * 1 channel)
    '''
    sound, _ = torchaudio.load(path)
    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)
    return sound
def load_audio(path):
    '''
    Input:
        path : string, path of the audio to load
    Output:
        sound : numpy.ndarray, mono audio data; multi-channel input is averaged
    '''
    sound, _ = torchaudio.load(path)
    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)
    return sound
def load_audio(path): """ Args: path : string 载入音频的路径 Returns: sound : numpy.ndarray 单声道音频数据,如果是多声道进行平均 """ sound, _ = torchaudio.load(path) sound = sound.numpy() if len(sound.shape) > 1: if sound.shape[1] == 1: sound = sound.squeeze() else: sound = sound.mean(axis=1) return sound
def load_wave(path, normalize=True):
    """
    Args:
        path : string, path of the audio to load
    Returns:
        wave : torch.FloatTensor, mono waveform, mean/std normalized when
            ``normalize`` is True
    """
    sound, _ = torchaudio.load(path)
    sound = sound.numpy()
    if len(sound.shape) > 1:
        if sound.shape[1] == 1:
            sound = sound.squeeze()
        else:
            sound = sound.mean(axis=1)
    wave = torch.FloatTensor(sound)
    if normalize:
        mean = wave.mean()
        std = wave.std()
        wave.add_(-mean)
        wave.div_(std)
    return wave
def download_vctk(destination, tmp_dir=None, device="cpu"): """Download dataset and perform resample to 16000 Hz. Arguments --------- destination : str Place to put final zipped dataset. tmp_dir : str Location to store temporary files. Will use `tempfile` if not provided. device : str Passed directly to pytorch's ``.to()`` method. Used for resampling. """ dataset_name = "noisy-vctk-16k" if tmp_dir is None: tmp_dir = tempfile.gettempdir() final_dir = os.path.join(tmp_dir, dataset_name) if not os.path.isdir(tmp_dir): os.mkdir(tmp_dir) if not os.path.isdir(final_dir): os.mkdir(final_dir) prefix = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/2791/" noisy_vctk_urls = [ prefix + "clean_testset_wav.zip", prefix + "noisy_testset_wav.zip", prefix + "testset_txt.zip", prefix + "clean_trainset_28spk_wav.zip", prefix + "noisy_trainset_28spk_wav.zip", prefix + "trainset_28spk_txt.zip", ] zip_files = [] for url in noisy_vctk_urls: filename = os.path.join(tmp_dir, url.split("/")[-1]) zip_files.append(filename) if not os.path.isfile(filename): logger.info("Downloading " + url) with urllib.request.urlopen(url) as response: with open(filename, "wb") as tmp_file: logger.info("... to " + tmp_file.name) shutil.copyfileobj(response, tmp_file) # Unzip for zip_file in zip_files: logger.info("Unzipping " + zip_file) shutil.unpack_archive(zip_file, tmp_dir, "zip") os.remove(zip_file) # Move transcripts to final dir shutil.move(os.path.join(tmp_dir, "testset_txt"), final_dir) shutil.move(os.path.join(tmp_dir, "trainset_28spk_txt"), final_dir) # Downsample dirs = [ "noisy_testset_wav", "clean_testset_wav", "noisy_trainset_28spk_wav", "clean_trainset_28spk_wav", ] downsampler = Resample(orig_freq=48000, new_freq=16000) for directory in dirs: logger.info("Resampling " + directory) dirname = os.path.join(tmp_dir, directory) # Make directory to store downsampled files dirname_16k = os.path.join(final_dir, directory + "_16k") if not os.path.isdir(dirname_16k): os.mkdir(dirname_16k) # Load files and downsample for filename in get_all_files(dirname, match_and=[".wav"]): signal, rate = torchaudio.load(filename) downsampled_signal = downsampler(signal.view(1, -1).to(device)) # Save downsampled file torchaudio.save( os.path.join(dirname_16k, filename[-12:]), downsampled_signal[0].cpu(), sample_rate=16000, channels_first=False, ) # Remove old file os.remove(filename) # Remove old directory os.rmdir(dirname) logger.info("Zipping " + final_dir) final_zip = shutil.make_archive( base_name=final_dir, format="zip", root_dir=os.path.dirname(final_dir), base_dir=os.path.basename(final_dir), ) logger.info(f"Moving {final_zip} to {destination}") shutil.move(final_zip, os.path.join(destination, dataset_name + ".zip"))
if char == '<s>': prev = '' continue hypothesis += char prev = char return hypothesis.replace('|', ' ') # Load Wav2Vec2 pretrained model from Hugging Face Hub model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") # Convert the model to torchaudio format, which supports TorchScript. model = import_huggingface_model(model) # Remove weight normalization which is not supported by quantization. model.encoder.transformer.pos_conv_embed.__prepare_scriptable__() model = model.eval() # Attach decoder model = SpeechRecognizer(model) # Apply quantization / script / optimize for motbile quantized_model = torch.quantization.quantize_dynamic( model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8) scripted_model = torch.jit.script(quantized_model) optimized_model = optimize_for_mobile(scripted_model) # Sanity check waveform , _ = torchaudio.load('scent_of_a_woman_future.wav') print('Result:', optimized_model(waveform)) optimized_model._save_for_lite_interpreter("wav2vec2.ptl")
def test_Vol(self):
    test_filepath = common_utils.get_asset_path(
        'steam-train-whistle-daniel_simon.wav')
    waveform, _ = torchaudio.load(test_filepath)
    self._assert_consistency(T.Vol(1.1), waveform)
def _reload_signal(self):
    data_signal, sample_rate = torchaudio.load(self.path)
    self._sample_rate = sample_rate
    self._data_signal = data_signal
def __call__( self, file: AudioFile, sample_offset: int = 0, num_samples: int = None ) -> Tensor: """ Parameters ---------- file : AudioFile Audio file. sample_offset : int, optional Start loading at this `sample_offset` sample. Defaults ot 0. num_samples : int, optional Load that many samples. Defaults to load up to the end of the file. Returns ------- samples : (time, channel) torch.Tensor Samples """ self.is_valid(file) original_samples = None if isinstance(file, dict): # file = {"samples": torch.Tensor, "sample_rate": int, [ "channel": int ]} if "samples" in file: original_samples = file["samples"] original_sample_rate = file["sample_rate"] original_total_num_samples = original_samples.shape[1] channel = file.get("channel", None) # file = {"audio": str or Path, [ "channel": int ]} else: audio_path = str(file["audio"]) ( original_total_num_samples, original_sample_rate, ) = self.get_audio_metadata(audio_path) channel = file.get("channel", None) # file = str or Path else: audio_path = str(file) original_total_num_samples, original_sample_rate = self.get_audio_metadata( audio_path ) channel = None original_sample_offset = round( sample_offset * original_sample_rate / self.sample_rate ) if num_samples is None: original_num_samples = original_total_num_samples - original_sample_offset else: original_num_samples = round( num_samples * original_sample_rate / self.sample_rate ) if original_sample_offset + original_num_samples > original_total_num_samples: raise ValueError() if original_samples is None: try: original_data, _ = torchaudio.load( audio_path, frame_offset=original_sample_offset, num_frames=original_num_samples, ) except TypeError: raise Exception( "It looks like you are using an unsupported version of torchaudio." " If you have 0.6 or older, please upgrade to a newer version." ) else: original_data = original_samples[ :, original_sample_offset : original_sample_offset + original_num_samples ] if channel is not None: original_data = original_data[channel - 1 : channel, :] result = self.downmix_and_resample(original_data, original_sample_rate) if num_samples is not None: # If there is an off-by-one error in the length (e.g. due to resampling), fix it. if result.shape[-1] > num_samples: result = result[:, :num_samples] elif result.shape[-1] < num_samples: diff = num_samples - result.shape[-1] result = torch.nn.functional.pad(result, (0, diff)) return result
def load_wav(path):
    signal, _ = torchaudio.load(path)
    signal = signal.reshape(-1)
    return signal
    lines = f.readlines()

data = [(line.strip().split(',')[0], int(line.strip().split(',')[1]))
        for line in lines]

model.eval()
with torch.no_grad():
    all_labels = []
    all_pred_labels = []
    small_audios_indices = []
    audio_lengths = []
    wrong_sample_rate_indices = []
    clean_indices = []
    total_mag = []
    # for batch_num, (audio, label) in enumerate(loader):
    for i, (path, label) in enumerate(data):
        audio, sample_rate = torchaudio.load('/storage' + path)
        if sample_rate != 8000:
            wrong_sample_rate_indices.append(i)
            continue
        if model.cnn_type == 'vgg':
            min_len = 10100
        else:
            min_len = 12300
        if audio.shape[1] < min_len:
            # print("Audio at index {} is too small to pass through CNN".format(i))
            small_audios_indices.append(i)
            continue
        if audio.shape[1] > 80000:
            audio = audio[:, :80000]
        clean_indices.append(i)
        total_mag.append(
import torch
import torchaudio
import pytorch_lightning as pl
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
import os

torch.cuda.is_available()
print(os.getcwd())

csv = pd.read_csv('./data/ESC-50-master/meta/esc50.csv')
x, sr = torchaudio.load(f'./data/ESC-50-master/audio/{csv.iloc[0, 0]}')
# Resample from the file's native rate down to 8 kHz.
h = torchaudio.transforms.Resample(orig_freq=sr, new_freq=8000)(x)
print(h.shape)
plt.plot(h[0])
def getDFTFeature(filepath,
                  win_size=1024,
                  win_shift=512,
                  preemphasis=False,
                  channel_first=True,
                  drop_dc=True,
                  cut_len=5160,
                  normalize=False):
    '''Compute the log DFT spectrum of an audio file.

    Args:
        filepath: path to the audio file
        win_size: window size (in samples)
        win_shift: hop size (in samples)
        preemphasis: whether to apply pre-emphasis (a first-order difference
            that attenuates low frequencies and boosts high frequencies)
        channel_first: whether to put channels in the first dimension
        drop_dc: whether to drop the DC component of the spectrum (frequency == 0)
        cut_len: keep this fixed number of frames along the time axis
        normalize: the energies are small (log power around -100 dB), so
            normalization is useful
    Return:
        (log_power_spectrum, phase_spectrum): power and phase spectra stacked
        into one tensor of size (2C, T, M//2), where C is the number of
        channels, T the number of frames and M the number of FFT points;
        permuted to (T, M//2, 2C) when channel_first is False
    '''
    waveform, sample_freq = torchaudio.load(filepath)
    m, n = waveform.shape

    # pad so the signal splits into a whole number of frames
    if (n - win_size) % win_shift != 0:
        waveform = torch.cat(
            [waveform, torch.zeros(m, win_shift - (n - win_size) % win_shift)],
            dim=1)
        n = waveform.shape[1]

    # split frames into rows
    frame_num = (n - win_size) // win_shift + 1
    strided_input = waveform.as_strided((m, frame_num, win_size),
                                        (n, win_shift, 1))
    strided_input = strided_input - torch.mean(strided_input, dim=2).unsqueeze(2)

    # pre-emphasis
    if preemphasis:
        coeff = 0.97
        offset_strided_input = torch.nn.functional.pad(strided_input, (1, 0),
                                                       mode='replicate')
        strided_input = strided_input - coeff * offset_strided_input[:, :, :-1]

    # window and FFT
    win_func = torch.hamming_window(win_size, periodic=False)
    windowed_input = strided_input * win_func
    fft = torch.rfft(windowed_input, 1, normalized=False,
                     onesided=True) * 2 / win_size
    if drop_dc:
        fft = fft[:, :, 1:]
    fft = fft[:, :cut_len, :]
    power_spectrum = fft.pow(2).sum(3)
    log_power_spectrum = torch.log10(power_spectrum) * 10

    # normalize the log power spectrum
    if normalize:
        mean_vec = log_power_spectrum.mean(axis=1, keepdim=True)
        std_vec = log_power_spectrum.std(axis=1, keepdim=True)
        log_power_spectrum = (log_power_spectrum - mean_vec) / std_vec

    # phase: acos(real / magnitude) gives values in [0, pi];
    # flip the sign where the imaginary part is negative to recover (-pi, pi]
    phase_spectrum = fft[:, :, :, 0] / fft.pow(2).sum(3).sqrt()
    phase_spectrum = torch.acos(phase_spectrum)
    phase_spectrum[fft[:, :, :, 1] < 0] = -phase_spectrum[fft[:, :, :, 1] < 0]

    spectrums = torch.cat([log_power_spectrum, phase_spectrum], dim=0)
    if not channel_first:
        spectrums = spectrums.permute(1, 2, 0)
    return spectrums
def __getitem__(self, index): np.random.seed() sample_name = self.sample_list[index] ############################################################### # 0. Batch settings ############################################################### if self.sample_count % self.batch_size == 0: self.sample_count = 0 # random global stretch & compression for the batch if self.global_stretch: self.frames = np.random.randint(self.min_stretch_frames, self.max_stretch_frames + 1) self.sample_count += 1 target = np.linspace( -1, 1, int(self.cfg["sample_duration"] * self.cfg["vid_framerate"])) ############################################################### # 1. Video data (keypoints) ############################################################### video_feature = scipy.io.loadmat('{0}/{1}.mat'.format( self.cfg["video_feature_root"], sample_name)) # grab keypoints (discard foot keypoints) video_feature = video_feature['lip_list'][0] video_feature = video_feature[:, 0:self.cfg["num_lip_keypoints"], :] # crop desired length from the video --- 300 frames for 12 seconds n_frames = video_feature.shape[0] frame_start = np.random.randint( 0, n_frames - self.cfg["sample_duration"] * self.cfg["vid_framerate"] + 1) frame_end = frame_start + self.cfg["sample_duration"] * self.cfg[ "vid_framerate"] video_feature = video_feature[frame_start:frame_end, :, :] # normalize keypoint positions for kp in range(self.cfg["num_lip_keypoints"]): scaler = MinMaxScaler(feature_range=(-1, 1)) video_feature[:, kp, :] = scaler.fit_transform(video_feature[:, kp, :]) #video_feature = video_feature.reshape(-1, self.cfg["num_lip_keypoints"]*2) # video augmentation --- horizontal flipping if self.mode == 'train': if np.random.rand() < self.cfg["prob_horizontal_flip"]: video_feature[:, :, 0] = video_feature[:, :, 0] * -1 ############################################################### # 2. Global Shift ############################################################### if self.global_shift: # shift the audio in feasible range left_shift = min(frame_start, self.max_shift_frames) right_shift = min(n_frames - frame_end, self.max_shift_frames) frame_shift = np.random.randint(-left_shift, right_shift + 1) audio_frame_start = frame_start + frame_shift # modify the target array target += 2 * frame_shift / (self.cfg["vid_framerate"] * self.cfg["sample_duration"]) else: audio_frame_start = frame_start ############################################################### # 3. 
Audio data (Spectrogram) ############################################################### waveform, audio_sample_rate = torchaudio.load('{0}/{1}.mp3'.format( self.cfg["audio_feature_root"], sample_name)) # crop 12s of audio data audio_start = int(audio_frame_start * self.cfg["mel_sr"] / self.cfg["vid_framerate"]) audio_end = int(audio_start + self.cfg["mel_sr"] * self.cfg["sample_duration"]) waveform = waveform[0, audio_start:audio_end] # compute spectrogram audio_feature = self.spec2db(self.calc_spec(waveform)) audio_feature = torch.squeeze(audio_feature, 0) # normalize spectrogram audio_feature = (audio_feature - torch.mean(audio_feature)) / ( torch.max(audio_feature) - torch.min(audio_feature)) # spectrogram augmentation if self.mode == 'train': # frequency masking if np.random.rand() < self.cfg["prob_freqmask"]: dur = np.random.randint(self.cfg["min_freqmask"], self.cfg["max_freqmask"] + 1) st = np.random.randint(0, audio_feature.shape[0] - dur + 1) audio_feature[st:st + dur, :] = 0 # time masking if np.random.rand() < self.cfg["prob_timemask"]: dur = np.random.randint(self.cfg["min_timemask"], self.cfg["max_timemask"] + 1) st = np.random.randint(0, audio_feature.shape[1] - dur + 1) audio_feature[:, st:st + dur] = 0 ############################################################### # 3. Global Stretch & Local Distortion ############################################################### new_video_feature = np.zeros( (self.frames, self.cfg["num_lip_keypoints"], 2)) new_target = np.zeros(self.frames) # Random distortion random_position = np.linspace(-1, 1, self.frames) if self.local_distortion: random_position = np.zeros(self.frames) while np.max( np.abs(random_position - np.linspace(-1, 1, self.frames)) ) > self.max_distortion_ratio: resample_len = int(self.frames * self.cfg["random_resample_rate"]) random_position = np.random.rand(resample_len) random_position = np.sort( (random_position - np.min(random_position)) * 2 / (np.max(random_position) - np.min(random_position)) - 1) f = interpolate.interp1d(np.linspace(-1, 1, resample_len), random_position, kind='linear') random_position = f(np.linspace(-1, 1, self.frames)) # Distorted & Stretched video feature for k in range(self.frames): orig_index = (random_position[k] + 1) / 2 * ( self.cfg["sample_duration"] * self.cfg["vid_framerate"] - 1) lower = int(np.floor(orig_index)) upper = int(np.ceil(orig_index)) if lower == upper: new_video_feature[k, :, :] = video_feature[lower, :, :] new_target[k] = target[lower] else: new_video_feature[k, :, :] = video_feature[lower, :, :] * ( upper - orig_index) + video_feature[upper, :, :] * ( orig_index - lower) new_target[k] = target[lower] * ( upper - orig_index) + target[upper] * (orig_index - lower) video_feature = new_video_feature target = new_target # velocity video_velo = np.zeros((self.frames, self.cfg["num_lip_keypoints"], 2)) video_velo[ 1:, :, :] = video_feature[1:, :, :] - video_feature[:-1, :, :] video_velo = video_velo / np.amax(np.absolute(video_velo)) # acceleration video_acc = np.zeros((self.frames, self.cfg["num_lip_keypoints"], 2)) video_acc[1:, :, :] = video_velo[1:, :, :] - video_velo[:-1, :, :] video_acc = video_acc / np.amax(np.absolute(video_acc)) # aggregate video_agg = np.zeros( (self.frames, self.cfg["num_lip_keypoints"], 2, 2)) video_agg[:, :, :, 0] = video_velo video_agg[:, :, :, 1] = video_acc return {"video_feature": torch.from_numpy(video_agg.astype(np.float32)), \ "audio_feature": audio_feature, \ "target": torch.from_numpy(target.astype(np.float32)), \ "sample_name": sample_name}
def extract_length(input_file):
    wav, _ = torchaudio.load(input_file)
    return wav.size(-1)
def torchaudio_load_file(file_path, normalization=True):
    data, sr = torchaudio.load(str(file_path))
    return data.float(), sr
def load_audio(self, audiopath):
    self.waveform, self.sample_rate = torchaudio.load(audiopath)
    return self.waveform, self.sample_rate
def download(self): """Download the yesno data if it doesn't exist in processed_folder already.""" from six.moves import urllib import tarfile if self._check_exists(): return raw_abs_dir = os.path.join(self.root, self.raw_folder) processed_abs_dir = os.path.join(self.root, self.processed_folder) dset_abs_path = os.path.join( self.root, self.raw_folder, self.dset_path) # download files try: os.makedirs(os.path.join(self.root, self.raw_folder)) os.makedirs(os.path.join(self.root, self.processed_folder)) except OSError as e: if e.errno == errno.EEXIST: pass else: raise url = self.url print('Downloading ' + url) filename = url.rpartition('/')[2] file_path = os.path.join(self.root, self.raw_folder, filename) if not os.path.isfile(file_path): urllib.request.urlretrieve(url, file_path) else: print("Tar file already downloaded") if not os.path.exists(dset_abs_path): with tarfile.open(file_path) as zip_f: zip_f.extractall(raw_abs_dir) else: print("Tar file already extracted") if not self.dev_mode: os.unlink(file_path) # process and save as torch files print('Processing...') shutil.copyfile( os.path.join(dset_abs_path, "README"), os.path.join(processed_abs_dir, "YESNO_README") ) audios = [x for x in os.listdir(dset_abs_path) if ".wav" in x] print("Found {} audio files".format(len(audios))) tensors = [] labels = [] lengths = [] for i, f in enumerate(audios): full_path = os.path.join(dset_abs_path, f) sig, sr = torchaudio.load(full_path) tensors.append(sig) lengths.append(sig.size(0)) labels.append(os.path.basename(f).split(".", 1)[0].split("_")) # sort sigs/labels: longest -> shortest tensors, labels = zip(*[(b, c) for (a, b, c) in sorted( zip(lengths, tensors, labels), key=lambda x: x[0], reverse=True)]) self.max_len = tensors[0].size(0) torch.save( (tensors, labels), os.path.join( self.root, self.processed_folder, self.processed_file ) ) if not self.dev_mode: shutil.rmtree(raw_abs_dir, ignore_errors=True) print('Done!')
def _load_wav(self, path):
    wav, _ = torchaudio.load(path_join(self.data_dir, path))
    wav = self.resampler(wav).squeeze(0)
    return wav
def open(self, item, **kwargs):
    if isinstance(item, (Path, PosixPath, str)):
        sig, sr = torchaudio.load(item)
        return AudioItem(sig, sr, path=Path(item))
    if isinstance(item, (tuple, np.ndarray)):
        return AudioItem(item)
#
# To load audio data, you can use ``torchaudio.load``.
#
# This function accepts a path-like object or file-like object as input.
#
# The returned value is a tuple of waveform (``Tensor``) and sample rate
# (``int``).
#
# By default, the resulting tensor object has ``dtype=torch.float32`` and
# its value range is normalized within ``[-1.0, 1.0]``.
#
# For the list of supported formats, please refer to `the torchaudio
# documentation <https://pytorch.org/audio>`__.
#

waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH)

print_stats(waveform, sample_rate=sample_rate)
plot_waveform(waveform, sample_rate)
plot_specgram(waveform, sample_rate)
play_audio(waveform, sample_rate)

######################################################################
# Loading from file-like object
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# ``torchaudio``\ ’s I/O functions now support file-like objects. This
# allows for fetching and decoding audio data from locations
# within and beyond the local file system.
# The following examples illustrate this.
#
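######################################################################
# A minimal sketch (not part of the original snippet) of passing a file-like
# object to ``torchaudio.load``: any object exposing ``read`` works. It reuses
# ``SAMPLE_WAV_SPEECH_PATH`` from above; the variable names below are
# illustrative only.
#

# Open the file ourselves and hand the file object to torchaudio.
with open(SAMPLE_WAV_SPEECH_PATH, "rb") as file_obj:
    waveform_from_fileobj, sr_from_fileobj = torchaudio.load(file_obj)

# The result should match the path-based load above.
print(waveform_from_fileobj.shape, sr_from_fileobj)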
def default_loader(path):
    audio, sr = torchaudio.load(path)
    return audio, sr
def create_gold_file(data_path, sample_rate): """ Create the following files: gold_units.json : contains gold_dicts, a list of mappings of {"sentence_id" : str, "units" : a list of ints representing phoneme id for each feature frame, "text" : a list of strs representing phoneme tokens for each feature frame} abx_triplets.item : contains ABX triplets in the format line 0 : whatever (not read) line > 0: #file_ID onset offset #phone prev-phone next-phone speaker onset : begining of the triplet (in s) offset : end of the triplet (in s) """ wav_scp_file = os.path.join(data_path, "mscoco2k_wav.scp") split_file = os.path.join(data_path, "mscoco2k_retrieval_split.txt") select_idxs = [ idx for idx, is_test in enumerate(open(split_file, 'r')) if int(is_test) ] phone_info_dict = json.load( open(os.path.join(data_path, "mscoco2k_phone_info.json"), "r")) phone_to_index = {} gold_dicts = [] triplets = ['#file_ID onset offset #phone prev-phone next-phone speaker'] # Extract audio file names as sentence ids with open(wav_scp_file, 'r') as wav_scp_f: filenames = [l.split()[-1] for idx, l in enumerate(wav_scp_f)] # Extract utterance duration durations = [ int(torchaudio.load(fn)[0].size(-1) * 1000 // (10 * sample_rate)) for fn in filenames ] # Extract phone mapping phone_path = os.path.join(data_path, "phone2id.json") if os.path.exists(phone_path): phone_to_index = json.load(open(phone_path, "r")) else: phones = set() for idx, (_, phone_info) in enumerate( sorted(phone_info_dict.items(), key=lambda x: int(x[0].split("_")[-1]))): for word_token in phone_info["data_ids"]: for phone_token in word_token[2]: token = phone_token[0] phones.update([token]) phone_to_index = {x: i for i, x in enumerate(sorted(phones))} phone_to_index[UNK] = len(phone_to_index) json.dump(phone_to_index, open(phone_path, "w"), indent=2) # Extract phone units phone_to_word_counts = collections.defaultdict(dict) global_idx = 0 for idx, (_, phone_info) in enumerate( sorted(phone_info_dict.items(), key=lambda x: int(x[0].split("_")[-1]))): if not idx in select_idxs: continue begin_word = 0 for word_info, word_token in zip(phone_info["data_ids"], phone_info["concepts"]): dur_word = word_info[2][-1][2] - word_info[2][0][1] end_word = begin_word + dur_word nframes = int(dur_word // 10) gold_dict = { "sentence_id": filenames[idx], "units": [-1] * nframes, "phoneme_text": [UNK] * nframes, "word_text": [word_token] * nframes, "interval": [begin_word, end_word] } begin_phone = 0 prefix = filenames.split('/')[-1] example_id = f"{prefix}_{global_idx}" global_idx += 1 for phn_idx, phone_token in enumerate(word_info[2]): if not word_token in phone_to_word_counts[phone_token[0]]: phone_to_word_counts[phone_token[0]][word_token] = 1 else: phone_to_word_counts[phone_token[0]][word_token] += 1 token, begin, end = phone_token[0], phone_token[ 1], phone_token[2] dur_phone = end - begin begin_frame = int(begin_phone // 10) end_frame = int((begin_phone + dur_phone) // 10) if (begin_word + begin_phone + dur_phone) // 10 > durations[idx]: print( 'In {}: end frame exceeds duration of audio, {} > {}'. 
format(filenames[idx], (begin_word + begin_phone + dur_phone) // 10, durations[idx])) break if phn_idx == 0: prev_token = NULL else: prev_token = word_info[2][phn_idx - 1][0] if phn_idx == len(word_info[2]) - 1: next_token = NULL else: next_token = word_info[2][phn_idx + 1][0] triplets.append( f'{example_id} {begin_phone / 1000.0:.4f} {(begin_phone + dur_phone)/ 1000.0:.4f} {token} {prev_token} {next_token} 0' ) for t in range(begin_frame, end_frame): gold_dict["units"][t] = phone_to_index[token] gold_dict["phoneme_text"][t] = token begin_phone += dur_phone if end_frame != nframes: gold_dict['phoneme_text'] = gold_dict[ 'phoneme_text'][:end_frame] gold_dict['word_text'] = gold_dict['word_text'][:end_frame] print('sentence_id, end_frame, nframes: ', filenames[idx], end_frame, nframes) gold_dicts.append(gold_dict) begin_word += dur_word with open(os.path.join(data_path, 'phone_token_top_10_words.txt'), 'w') as f: f.write('Phone\tWord\tCounts\n') for p in phone_to_word_counts: for w in sorted(phone_to_word_counts[p], key=lambda x: phone_to_word_counts[p][x], reverse=True): f.write('{}\t{}\t{}\n'.format(p, w, phone_to_word_counts[p][w])) with open(os.path.join(data_path, "gold_units.json"), "w") as gold_f: json.dump(gold_dicts, gold_f, indent=2) with open(os.path.join(data_path, "abx_triplets.item"), "w") as triplet_f: f.write('\n'.join(triplets))
def test_batch_pitch(self):
    waveform, sample_rate = torchaudio.load(self.test_filepath)
    self._test_batch(F.detect_pitch_frequency, waveform, sample_rate)
import torch
import torchaudio
import matplotlib.pyplot as plt

torchaudio.set_audio_backend('soundfile')

waveform, sample_rate = torchaudio.load('data/Clover.flac')
print(f'Shape of waveform: {waveform.size()}')
print(f'Sample rate of waveform: {sample_rate}')

plt.figure()
plt.plot(waveform.t().numpy())
plt.show()
def test_jit_pitch(self):
    waveform, sample_rate = torchaudio.load(self.test_filepath)
    _test_torchscript_functional(F.detect_pitch_frequency, waveform, sample_rate)
def test_save(self): # load signal x, sr = torchaudio.load(self.test_filepath) # check save new_filepath = os.path.join(self.test_dirpath, "test.wav") torchaudio.save(new_filepath, x, sr) self.assertTrue(os.path.isfile(new_filepath)) os.unlink(new_filepath) # check automatic normalization x /= 1 << 31 torchaudio.save(new_filepath, x, sr) self.assertTrue(os.path.isfile(new_filepath)) os.unlink(new_filepath) # test save 1d tensor x = x[:, 0] # get mono signal x.squeeze_() # remove channel dim torchaudio.save(new_filepath, x, sr) self.assertTrue(os.path.isfile(new_filepath)) os.unlink(new_filepath) # don't allow invalid sizes as inputs with self.assertRaises(ValueError): x.unsqueeze_(0) # N x L not L x N torchaudio.save(new_filepath, x, sr) with self.assertRaises(ValueError): x.squeeze_() x.unsqueeze_(1) x.unsqueeze_(0) # 1 x L x 1 torchaudio.save(new_filepath, x, sr) # automatically convert sr from floating point to int x.squeeze_(0) torchaudio.save(new_filepath, x, float(sr)) self.assertTrue(os.path.isfile(new_filepath)) os.unlink(new_filepath) # don't allow uneven integers with self.assertRaises(TypeError): torchaudio.save(new_filepath, x, float(sr) + 0.5) self.assertTrue(os.path.isfile(new_filepath)) os.unlink(new_filepath) # don't save to folders that don't exist with self.assertRaises(OSError): new_filepath = os.path.join(self.test_dirpath, "no-path", "test.wav") torchaudio.save(new_filepath, x, sr) # save created file sinewave_filepath = os.path.join(self.test_dirpath, "assets", "sinewave.wav") sr = 16000 freq = 440 volume = 0.3 y = (torch.cos(2 * math.pi * torch.arange(0, 4 * sr).float() * freq / sr)) y.unsqueeze_(1) # y is between -1 and 1, so must scale y = (y * volume * 2**31).long() torchaudio.save(sinewave_filepath, y, sr) self.assertTrue(os.path.isfile(sinewave_filepath)) # test precision new_filepath = os.path.join(self.test_dirpath, "test.wav") _, _, _, bp = torchaudio.info(sinewave_filepath) torchaudio.save(new_filepath, y, sr, precision=16) _, _, _, bp16 = torchaudio.info(new_filepath) self.assertEqual(bp, 32) self.assertEqual(bp16, 16) os.unlink(new_filepath)
class TestFunctional(unittest.TestCase): data_sizes = [(2, 20), (3, 15), (4, 10)] number_of_trials = 100 specgram = torch.tensor([1., 2., 3., 4.]) test_dirpath, test_dir = common_utils.create_temp_assets_dir() test_filepath = os.path.join(test_dirpath, 'assets', 'steam-train-whistle-daniel_simon.mp3') waveform_train, sr_train = torchaudio.load(test_filepath) def test_torchscript_spectrogram(self): tensor = torch.rand((1, 1000)) n_fft = 400 ws = 400 hop = 200 pad = 0 window = torch.hann_window(ws) power = 2 normalize = False _test_torchscript_functional( F.spectrogram, tensor, pad, window, n_fft, hop, ws, power, normalize ) def test_torchscript_griffinlim(self): tensor = torch.rand((1, 201, 6)) n_fft = 400 ws = 400 hop = 200 window = torch.hann_window(ws) power = 2 normalize = False momentum = 0.99 n_iter = 32 length = 1000 init = 0 _test_torchscript_functional( F.griffinlim, tensor, window, n_fft, hop, ws, power, normalize, n_iter, momentum, length, 0 ) def test_batch_griffinlim(self): torch.random.manual_seed(42) tensor = torch.rand((1, 201, 6)) n_fft = 400 ws = 400 hop = 200 window = torch.hann_window(ws) power = 2 normalize = False momentum = 0.99 n_iter = 32 length = 1000 self._test_batch( F.griffinlim, tensor, window, n_fft, hop, ws, power, normalize, n_iter, momentum, length, 0, atol=5e-5 ) def _test_compute_deltas(self, specgram, expected, win_length=3, atol=1e-6, rtol=1e-8): computed = F.compute_deltas(specgram, win_length=win_length) self.assertTrue(computed.shape == expected.shape, (computed.shape, expected.shape)) torch.testing.assert_allclose(computed, expected, atol=atol, rtol=rtol) def test_compute_deltas_onechannel(self): specgram = self.specgram.unsqueeze(0).unsqueeze(0) expected = torch.tensor([[[0.5, 1.0, 1.0, 0.5]]]) self._test_compute_deltas(specgram, expected) def test_compute_deltas_twochannel(self): specgram = self.specgram.repeat(1, 2, 1) expected = torch.tensor([[[0.5, 1.0, 1.0, 0.5], [0.5, 1.0, 1.0, 0.5]]]) self._test_compute_deltas(specgram, expected) def test_compute_deltas_randn(self): channel = 13 n_mfcc = channel * 3 time = 1021 win_length = 2 * 7 + 1 specgram = torch.randn(channel, n_mfcc, time) computed = F.compute_deltas(specgram, win_length=win_length) self.assertTrue(computed.shape == specgram.shape, (computed.shape, specgram.shape)) _test_torchscript_functional(F.compute_deltas, specgram, win_length=win_length) def test_batch_pitch(self): waveform, sample_rate = torchaudio.load(self.test_filepath) self._test_batch(F.detect_pitch_frequency, waveform, sample_rate) def test_jit_pitch(self): waveform, sample_rate = torchaudio.load(self.test_filepath) _test_torchscript_functional(F.detect_pitch_frequency, waveform, sample_rate) def _compare_estimate(self, sound, estimate, atol=1e-6, rtol=1e-8): # trim sound for case when constructed signal is shorter than original sound = sound[..., :estimate.size(-1)] self.assertTrue(sound.shape == estimate.shape, (sound.shape, estimate.shape)) self.assertTrue(torch.allclose(sound, estimate, atol=atol, rtol=rtol)) def _test_istft_is_inverse_of_stft(self, kwargs): # generates a random sound signal for each tril and then does the stft/istft # operation to check whether we can reconstruct signal for data_size in self.data_sizes: for i in range(self.number_of_trials): sound = common_utils.random_float_tensor(i, data_size) stft = torch.stft(sound, **kwargs) estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs) self._compare_estimate(sound, estimate) def test_istft_is_inverse_of_stft1(self): # hann_window, 
centered, normalized, onesided kwargs1 = { 'n_fft': 12, 'hop_length':4, 'win_length':12, 'window': torch.hann_window(12), 'center': True, 'pad_mode': 'reflect', 'normalized':True, 'onesided': True, } self._test_istft_is_inverse_of_stft(kwargs1) def test_istft_is_inverse_of_stft2(self): # hann_window, centered, not normalized, not onesided kwargs2 = { 'n_fft': 12, 'hop_length':2, 'win_length':8, 'window': torch.hann_window(8), 'center': True, 'pad_mode': 'reflect', 'normalized':False, 'onesided': False, } self._test_istft_is_inverse_of_stft(kwargs2) def test_istft_is_inverse_of_stft3(self): # hamming_window, centered, normalized, not onesided kwargs3 = { 'n_fft': 15, 'hop_length':3, 'win_length':11, 'window': torch.hamming_window(11), 'center': True, 'pad_mode': 'constant', 'normalized':True, 'onesided': False, } self._test_istft_is_inverse_of_stft(kwargs3) def test_istft_is_inverse_of_stft4(self): # hamming_window, not centered, not normalized, onesided # window same size as n_fft kwargs4 = { 'n_fft': 5, 'hop_length':2, 'win_length':5, 'window': torch.hamming_window(5), 'center': False, 'pad_mode': 'constant', 'normalized':False, 'onesided': True, } self._test_istft_is_inverse_of_stft(kwargs4) def test_istft_is_inverse_of_stft5(self): # hamming_window, not centered, not normalized, not onesided # window same size as n_fft kwargs5 = { 'n_fft': 3, 'hop_length':2, 'win_length':3, 'window': torch.hamming_window(3), 'center': False, 'pad_mode': 'reflect', 'normalized':False, 'onesided': False, } self._test_istft_is_inverse_of_stft(kwargs5) def test_istft_of_ones(self): # stft = torch.stft(torch.ones(4), 4) stft = torch.tensor([ [[4., 0.], [4., 0.], [4., 0.], [4., 0.], [4., 0.]], [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]], [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]] ]) estimate = torchaudio.functional.istft(stft, n_fft=4, length=4) self._compare_estimate(torch.ones(4), estimate) def test_istft_of_zeros(self): # stft = torch.stft(torch.zeros(4), 4) stft = torch.zeros((3, 5, 2)) estimate = torchaudio.functional.istft(stft, n_fft=4, length=4) self._compare_estimate(torch.zeros(4), estimate) def test_istft_requires_overlap_windows(self): # the window is size 1 but it hops 20 so there is a gap which throw an error stft = torch.zeros((3, 5, 2)) self.assertRaises(AssertionError, torchaudio.functional.istft, stft, n_fft=4, hop_length=20, win_length=1, window=torch.ones(1)) def test_istft_requires_nola(self): stft = torch.zeros((3, 5, 2)) kwargs_ok = { 'n_fft': 4, 'win_length':4, 'window': torch.ones(4), } kwargs_not_ok = { 'n_fft': 4, 'win_length':4, 'window': torch.zeros(4), } # A window of ones meets NOLA but a window of zeros does not. This should # throw an error. 
torchaudio.functional.istft(stft, **kwargs_ok) self.assertRaises(AssertionError, torchaudio.functional.istft, stft, **kwargs_not_ok) def test_istft_requires_non_empty(self): self.assertRaises(AssertionError, torchaudio.functional.istft, torch.zeros((3, 0, 2)), 2) self.assertRaises(AssertionError, torchaudio.functional.istft, torch.zeros((0, 3, 2)), 2) def _test_istft_of_sine(self, amplitude, L, n): # stft of amplitude*sin(2*pi/L*n*x) with the hop length and window size equaling L x = torch.arange(2 * L + 1, dtype=torch.get_default_dtype()) sound = amplitude * torch.sin(2 * math.pi / L * x * n) # stft = torch.stft(sound, L, hop_length=L, win_length=L, # window=torch.ones(L), center=False, normalized=False) stft = torch.zeros((L // 2 + 1, 2, 2)) stft_largest_val = (amplitude * L) / 2.0 if n < stft.size(0): stft[n, :, 1] = -stft_largest_val if 0 <= L - n < stft.size(0): # symmetric about L // 2 stft[L - n, :, 1] = stft_largest_val estimate = torchaudio.functional.istft(stft, L, hop_length=L, win_length=L, window=torch.ones(L), center=False, normalized=False) # There is a larger error due to the scaling of amplitude self._compare_estimate(sound, estimate, atol=1e-3) def test_istft_of_sine(self): self._test_istft_of_sine(amplitude=123, L=5, n=1) self._test_istft_of_sine(amplitude=150, L=5, n=2) self._test_istft_of_sine(amplitude=111, L=5, n=3) self._test_istft_of_sine(amplitude=160, L=7, n=4) self._test_istft_of_sine(amplitude=145, L=8, n=5) self._test_istft_of_sine(amplitude=80, L=9, n=6) self._test_istft_of_sine(amplitude=99, L=10, n=7) def _test_linearity_of_istft(self, data_size, kwargs, atol=1e-6, rtol=1e-8): for i in range(self.number_of_trials): tensor1 = common_utils.random_float_tensor(i, data_size) tensor2 = common_utils.random_float_tensor(i * 2, data_size) a, b = torch.rand(2) istft1 = torchaudio.functional.istft(tensor1, **kwargs) istft2 = torchaudio.functional.istft(tensor2, **kwargs) istft = a * istft1 + b * istft2 estimate = torchaudio.functional.istft(a * tensor1 + b * tensor2, **kwargs) self._compare_estimate(istft, estimate, atol, rtol) def test_linearity_of_istft1(self): # hann_window, centered, normalized, onesided kwargs1 = { 'n_fft': 12, 'window': torch.hann_window(12), 'center': True, 'pad_mode': 'reflect', 'normalized':True, 'onesided': True, } data_size = (2, 7, 7, 2) self._test_linearity_of_istft(data_size, kwargs1) def test_linearity_of_istft2(self): # hann_window, centered, not normalized, not onesided kwargs2 = { 'n_fft': 12, 'window': torch.hann_window(12), 'center': True, 'pad_mode': 'reflect', 'normalized':False, 'onesided': False, } data_size = (2, 12, 7, 2) self._test_linearity_of_istft(data_size, kwargs2) def test_linearity_of_istft3(self): # hamming_window, centered, normalized, not onesided kwargs3 = { 'n_fft': 12, 'window': torch.hamming_window(12), 'center': True, 'pad_mode': 'constant', 'normalized':True, 'onesided': False, } data_size = (2, 12, 7, 2) self._test_linearity_of_istft(data_size, kwargs3) def test_linearity_of_istft4(self): # hamming_window, not centered, not normalized, onesided kwargs4 = { 'n_fft': 12, 'window': torch.hamming_window(12), 'center': False, 'pad_mode': 'constant', 'normalized':False, 'onesided': True, } data_size = (2, 7, 3, 2) self._test_linearity_of_istft(data_size, kwargs4, atol=1e-5, rtol=1e-8) def test_batch_istft(self): stft = torch.tensor([ [[4., 0.], [4., 0.], [4., 0.], [4., 0.], [4., 0.]], [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]], [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]] ]) 
self._test_batch(F.istft, stft, n_fft=4, length=4) def _test_create_fb(self, n_mels=40, sample_rate=22050, n_fft=2048, fmin=0.0, fmax=8000.0): librosa_fb = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmax=fmax, fmin=fmin, htk=True, norm=None) fb = F.create_fb_matrix(sample_rate=sample_rate, n_mels=n_mels, f_max=fmax, f_min=fmin, n_freqs=(n_fft // 2 + 1)) for i_mel_bank in range(n_mels): assert torch.allclose(fb[:, i_mel_bank], torch.tensor(librosa_fb[i_mel_bank]), atol=1e-4) def test_create_fb(self): self._test_create_fb() self._test_create_fb(n_mels=128, sample_rate=44100) self._test_create_fb(n_mels=128, fmin=2000.0, fmax=5000.0) self._test_create_fb(n_mels=56, fmin=100.0, fmax=9000.0) self._test_create_fb(n_mels=56, fmin=800.0, fmax=900.0) self._test_create_fb(n_mels=56, fmin=1900.0, fmax=900.0) self._test_create_fb(n_mels=10, fmin=1900.0, fmax=900.0) def test_gain(self): waveform_gain = F.gain(self.waveform_train, 3) self.assertTrue(waveform_gain.abs().max().item(), 1.) E = torchaudio.sox_effects.SoxEffectsChain() E.set_input_file(self.test_filepath) E.append_effect_to_chain("gain", [3]) sox_gain_waveform = E.sox_build_flow_effects()[0] self.assertTrue(torch.allclose(waveform_gain, sox_gain_waveform, atol=1e-04)) def test_dither(self): waveform_dithered = F.dither(self.waveform_train) waveform_dithered_noiseshaped = F.dither(self.waveform_train, noise_shaping=True) E = torchaudio.sox_effects.SoxEffectsChain() E.set_input_file(self.test_filepath) E.append_effect_to_chain("dither", []) sox_dither_waveform = E.sox_build_flow_effects()[0] self.assertTrue(torch.allclose(waveform_dithered, sox_dither_waveform, atol=1e-04)) E.clear_chain() E.append_effect_to_chain("dither", ["-s"]) sox_dither_waveform_ns = E.sox_build_flow_effects()[0] self.assertTrue(torch.allclose(waveform_dithered_noiseshaped, sox_dither_waveform_ns, atol=1e-02)) def test_vctk_transform_pipeline(self): test_filepath_vctk = os.path.join(self.test_dirpath, "assets/VCTK-Corpus/wav48/p224/", "p224_002.wav") wf_vctk, sr_vctk = torchaudio.load(test_filepath_vctk) # rate sample = T.Resample(sr_vctk, 16000, resampling_method='sinc_interpolation') wf_vctk = sample(wf_vctk) # dither wf_vctk = F.dither(wf_vctk, noise_shaping=True) E = torchaudio.sox_effects.SoxEffectsChain() E.set_input_file(test_filepath_vctk) E.append_effect_to_chain("gain", ["-h"]) E.append_effect_to_chain("channels", [1]) E.append_effect_to_chain("rate", [16000]) E.append_effect_to_chain("gain", ["-rh"]) E.append_effect_to_chain("dither", ["-s"]) wf_vctk_sox = E.sox_build_flow_effects()[0] self.assertTrue(torch.allclose(wf_vctk, wf_vctk_sox, rtol=1e-03, atol=1e-03)) def test_pitch(self): test_dirpath, test_dir = common_utils.create_temp_assets_dir() test_filepath_100 = os.path.join(test_dirpath, 'assets', "100Hz_44100Hz_16bit_05sec.wav") test_filepath_440 = os.path.join(test_dirpath, 'assets', "440Hz_44100Hz_16bit_05sec.wav") # Files from https://www.mediacollege.com/audio/tone/download/ tests = [ (test_filepath_100, 100), (test_filepath_440, 440), ] for filename, freq_ref in tests: waveform, sample_rate = torchaudio.load(filename) freq = torchaudio.functional.detect_pitch_frequency(waveform, sample_rate) threshold = 1 s = ((freq - freq_ref).abs() > threshold).sum() self.assertFalse(s) # Convert to stereo and batch for testing purposes self._test_batch(F.detect_pitch_frequency, waveform, sample_rate) def _test_batch_shape(self, functional, tensor, *args, **kwargs): kwargs_compare = {} if 'atol' in kwargs: atol = kwargs['atol'] del 
kwargs['atol'] kwargs_compare['atol'] = atol if 'rtol' in kwargs: rtol = kwargs['rtol'] del kwargs['rtol'] kwargs_compare['rtol'] = rtol # Single then transform then batch torch.random.manual_seed(42) expected = functional(tensor.clone(), *args, **kwargs) expected = expected.unsqueeze(0).unsqueeze(0) # 1-Batch then transform tensors = tensor.unsqueeze(0).unsqueeze(0) torch.random.manual_seed(42) computed = functional(tensors.clone(), *args, **kwargs) self._compare_estimate(computed, expected, **kwargs_compare) return tensors, expected def _test_batch(self, functional, tensor, *args, **kwargs): tensors, expected = self._test_batch_shape(functional, tensor, *args, **kwargs) kwargs_compare = {} if 'atol' in kwargs: atol = kwargs['atol'] del kwargs['atol'] kwargs_compare['atol'] = atol if 'rtol' in kwargs: rtol = kwargs['rtol'] del kwargs['rtol'] kwargs_compare['rtol'] = rtol # 3-Batch then transform ind = [3] + [1] * (int(tensors.dim()) - 1) tensors = tensor.repeat(*ind) ind = [3] + [1] * (int(expected.dim()) - 1) expected = expected.repeat(*ind) torch.random.manual_seed(42) computed = functional(tensors.clone(), *args, **kwargs) def test_torchscript_create_fb_matrix(self): n_stft = 100 f_min = 0.0 f_max = 20.0 n_mels = 10 sample_rate = 16000 _test_torchscript_functional(F.create_fb_matrix, n_stft, f_min, f_max, n_mels, sample_rate) def test_torchscript_amplitude_to_DB(self): spec = torch.rand((6, 201)) multiplier = 10.0 amin = 1e-10 db_multiplier = 0.0 top_db = 80.0 _test_torchscript_functional(F.amplitude_to_DB, spec, multiplier, amin, db_multiplier, top_db) def test_torchscript_DB_to_amplitude(self): x = torch.rand((1, 100)) ref = 1. power = 1. _test_torchscript_functional(F.DB_to_amplitude, x, ref, power) def test_DB_to_amplitude(self): # Make some noise x = torch.rand(1000) spectrogram = torchaudio.transforms.Spectrogram() spec = spectrogram(x) amin = 1e-10 ref = 1.0 db_multiplier = math.log10(max(amin, ref)) # Waveform amplitude -> DB -> amplitude multiplier = 20. power = 0.5 db = F.amplitude_to_DB(torch.abs(x), multiplier, amin, db_multiplier, top_db=None) x2 = F.DB_to_amplitude(db, ref, power) self.assertTrue(torch.allclose(torch.abs(x), x2, atol=5e-5)) # Spectrogram amplitude -> DB -> amplitude db = F.amplitude_to_DB(spec, multiplier, amin, db_multiplier, top_db=None) x2 = F.DB_to_amplitude(db, ref, power) self.assertTrue(torch.allclose(spec, x2, atol=5e-5)) # Waveform power -> DB -> power multiplier = 10. power = 1. db = F.amplitude_to_DB(x, multiplier, amin, db_multiplier, top_db=None) x2 = F.DB_to_amplitude(db, ref, power) self.assertTrue(torch.allclose(torch.abs(x), x2, atol=5e-5)) # Spectrogram power -> DB -> power db = F.amplitude_to_DB(spec, multiplier, amin, db_multiplier, top_db=None) x2 = F.DB_to_amplitude(db, ref, power) self.assertTrue(torch.allclose(spec, x2, atol=5e-5)) def test_torchscript_create_dct(self): n_mfcc = 40 n_mels = 128 norm = "ortho" _test_torchscript_functional(F.create_dct, n_mfcc, n_mels, norm) def test_torchscript_mu_law_encoding(self): tensor = torch.rand((1, 10)) qc = 256 _test_torchscript_functional(F.mu_law_encoding, tensor, qc) def test_torchscript_mu_law_decoding(self): tensor = torch.rand((1, 10)) qc = 256 _test_torchscript_functional(F.mu_law_decoding, tensor, qc) def test_torchscript_complex_norm(self): complex_tensor = torch.randn(1, 2, 1025, 400, 2) power = 2 _test_torchscript_functional(F.complex_norm, complex_tensor, power) def test_mask_along_axis(self): specgram = torch.randn(2, 1025, 400) mask_param = 100 mask_value = 30. 
axis = 2 _test_torchscript_functional(F.mask_along_axis, specgram, mask_param, mask_value, axis) def test_mask_along_axis_iid(self): specgrams = torch.randn(4, 2, 1025, 400) mask_param = 100 mask_value = 30. axis = 2 _test_torchscript_functional(F.mask_along_axis_iid, specgrams, mask_param, mask_value, axis) def test_torchscript_gain(self): tensor = torch.rand((1, 1000)) gainDB = 2.0 _test_torchscript_functional(F.gain, tensor, gainDB) def test_torchscript_dither(self): tensor = torch.rand((2, 1000)) _test_torchscript_functional_shape(F.dither, tensor) _test_torchscript_functional_shape(F.dither, tensor, "RPDF") _test_torchscript_functional_shape(F.dither, tensor, "GPDF")
def test_Vad(self):
    filepath = common_utils.get_asset_path("vad-go-mono-32000.wav")
    waveform, sample_rate = torchaudio.load(filepath)
    self._assert_consistency(T.Vad(sample_rate=sample_rate), waveform)
def load_wav(self, file):
    waveform, sample_rate = torchaudio.load(self.wav_folder + file)
    if sample_rate != self.sr:
        raise AssertionError(
            f"unexpected sample rate {sample_rate}, expected {self.sr}")
    return waveform, sample_rate
def load_audio(path):
    waveform, sample_rate = torchaudio.load(path)
    return waveform
    f_info[0] : speaker ID
    f_info[1] : session number
    f_info[2] : sentence number
    f_info[3] : extension
    """
    print("===extract features to .npy===")
    for gender in ["male", "female"]:
        SUBDIR_PATH = DIR_PATH / gender
        for f in SUBDIR_PATH.rglob('*.wav'):
            f_info = re.split('[_.]', f.name)
            if int(f_info[2]) > 30:
                continue
            waveform, sample_rate = torchaudio.load(SUBDIR_PATH / f_info[0] / f.name,
                                                    normalization=True)
            feature = mfcc(waveform,
                           sample_rate,
                           winfunc=np.hamming,
                           numcep=hyper_params["numcep"],
                           nfilt=26,
                           nfft=512,
                           preemph=0.97)
            speaker = str(f_info[0])
            if speaker in train_speaker_list:
                f_path = FEATURE_PATH / "train" / f.name[:-4]
            elif speaker in val_speaker_list:
from pase.models.frontend import wf_builder

pase = wf_builder('cfg/frontend/PASE+.cfg').eval()
pase.load_pretrained('FE_e199.ckpt', load_last=True, verbose=True)
pase.cuda()

import sys
wav_path = sys.argv[1]
out_path = sys.argv[2]

# Now we can forward waveforms as Torch tensors
import torch
import torchaudio

torchaudio.set_audio_backend('sox')
x, sr = torchaudio.load(wav_path)
x = x.view(-1).cuda()
x = x.view(1, 1, -1)

with torch.no_grad():
    pase.eval()
    # y size will be (1, 256, 625), which are 625 frames of 256 dims each
    y = pase(x)[0].transpose(0, 1)

torch.save(y.detach().cpu(), out_path)
# build dataset
dataset_path = "D:/songs_headphones/stereo-cut-16k"
n_files = 0
fs = 0
pad = nn.ConstantPad1d((0, 1), 0)

start_time = time.time()
for subdir, dirs, files in os.walk(dataset_path):
    for file in files:
        filepath = subdir + os.sep + file
        if filepath.endswith(".wav"):
            if n_files == 0:
                dataset, fs = torchaudio.load(filepath)
                dataset = pad(dataset)
                dataset = dataset / torch.max(dataset)
            elif n_files == 1:
                new_file, fs = torchaudio.load(filepath)
                new_file = pad(new_file)
                new_file = new_file / torch.max(new_file)
                dataset = torch.stack((dataset, new_file))
            else:
                new_file, fs = torchaudio.load(filepath)
                new_file = pad(new_file)
                new_file = new_file / torch.max(new_file)
                dataset = torch.cat((dataset, new_file.unsqueeze(0)))
                              padding=False,
                              num_padding=0)
    wav_file = "/d1/jbaik/ics-asr/temp/conan1-8k.wav"
    audio = transformer(wav_file)
# test Spectrogram
elif test == 2:
    import matplotlib
    matplotlib.use('TkAgg')
    matplotlib.interactive(True)
    import matplotlib.pyplot as plt

    nperseg = int(p.SAMPLE_RATE * p.WINDOW_SIZE)
    noverlap = int(p.SAMPLE_RATE * (p.WINDOW_SIZE - p.WINDOW_SHIFT))

    wav_file = Path("../data/aspire/000/fe_03_00047-A-025005-025135.wav")
    audio, _ = torchaudio.load(wav_file)

    # pyplot specgram
    audio = torch.squeeze(audio)
    fig = plt.figure(0)
    plt.specgram(audio, Fs=p.SAMPLE_RATE, NFFT=p.NFFT, noverlap=noverlap,
                 cmap='plasma')

    # implemented transformer - scipy stft
    transformer = Spectrogram(sample_rate=p.SAMPLE_RATE,
                              window_stride=p.WINDOW_SHIFT,
                              window_size=p.WINDOW_SIZE,
                              nfft=p.NFFT)
def read_audio_file(path, src_dir, side, sample_rate, window_size,
                    window_stride, window, normalize_audio, truncate=None):
    """
    Args:
        path (str): location of a src file containing audio paths.
        src_dir (str): location of source audio files.
        side (str): 'src' or 'tgt'.
        sample_rate (int): sample_rate.
        window_size (float): window size for spectrogram in seconds.
        window_stride (float): window stride for spectrogram in seconds.
        window (str): window type for spectrogram generation.
        normalize_audio (bool): subtract spectrogram by mean and divide
            by std or not.
        truncate (int): maximum audio length (0 or None for unlimited).

    Yields:
        a dictionary containing audio data for each line.
    """
    assert (src_dir is not None) and os.path.exists(src_dir),\
        "src_dir must be a valid directory if data_type is audio"

    global torchaudio, librosa, np
    import torchaudio
    import librosa
    import numpy as np

    with codecs.open(path, "r", "utf-8") as corpus_file:
        index = 0
        for line in corpus_file:
            audio_path = os.path.join(src_dir, line.strip())
            if not os.path.exists(audio_path):
                audio_path = line

            assert os.path.exists(audio_path), \
                'audio path %s not found' % (line.strip())

            sound, sample_rate_ = torchaudio.load(audio_path)
            if truncate and truncate > 0:
                if sound.size(0) > truncate:
                    continue

            assert sample_rate_ == sample_rate, \
                'Sample rate of %s != -sample_rate (%d vs %d)' \
                % (audio_path, sample_rate_, sample_rate)

            sound = sound.numpy()
            if len(sound.shape) > 1:
                if sound.shape[1] == 1:
                    sound = sound.squeeze()
                else:
                    sound = sound.mean(axis=1)  # average multiple channels

            n_fft = int(sample_rate * window_size)
            win_length = n_fft
            hop_length = int(sample_rate * window_stride)
            # STFT
            d = librosa.stft(sound, n_fft=n_fft, hop_length=hop_length,
                             win_length=win_length, window=window)
            spect, _ = librosa.magphase(d)
            spect = np.log1p(spect)
            spect = torch.FloatTensor(spect)
            if normalize_audio:
                mean = spect.mean()
                std = spect.std()
                spect.add_(-mean)
                spect.div_(std)

            example_dict = {side: spect,
                            side + '_path': line.strip(),
                            'indices': index}
            index += 1

            yield example_dict