def __getitem__(self, idx):
    """Load the pair of recordings stored at ``idx``.

    Each entry of ``self.dataset`` unpacks to ``(y_label, x1_path, x2_path)``.
    Both paths are decoded with ``apply_effects_file`` using the module-level
    ``EFFECTS`` and dimension 0 is squeezed away.

    Returns:
        tuple: ``(wav1, wav2, length1, length2, label)`` where the waveforms
        are NumPy arrays, the lengths are ``shape[0]`` of the squeezed
        tensors, and ``label`` is ``int(y_label[0])``.
    """
    y_label, x1_path, x2_path = self.dataset[idx]

    loaded = []
    for audio_path in (x1_path, x2_path):
        # The second value returned by apply_effects_file is not needed here.
        tensor, _ = apply_effects_file(audio_path, EFFECTS)
        loaded.append(tensor.squeeze(0))
    first, second = loaded

    return (
        first.numpy(),
        second.numpy(),
        first.shape[0],
        second.shape[0],
        int(y_label[0]),
    )
def __getitem__(self, idx):
    """Load the pair of recordings stored at ``idx`` together with their names.

    Each entry of ``self.dataset`` unpacks to ``(y_label, x1_path, x2_path)``.
    A name is built from the last three components of each path joined by
    ``"-"``, with the final suffix stripped.

    Returns:
        tuple: ``(wav1, wav2, x1_name, x2_name, label)`` where the waveforms
        are NumPy arrays (dimension 0 squeezed) and ``label`` is
        ``int(y_label[0])``.
    """
    y_label, x1_path, x2_path = self.dataset[idx]

    def _name_of(audio_path):
        # Keep only the trailing three path components, then drop the suffix.
        tail = Path(audio_path).parts[-3:]
        return Path("-".join(tail)).stem

    x1_name, x2_name = _name_of(x1_path), _name_of(x2_path)

    first, _ = apply_effects_file(x1_path, EFFECTS)
    second, _ = apply_effects_file(x2_path, EFFECTS)

    return (
        first.squeeze(0).numpy(),
        second.squeeze(0).numpy(),
        x1_name,
        x2_name,
        int(y_label[0]),
    )
def main():
    """Export ``(audio_path, label)`` rows from a SQLite table to ``<job_id>.csv``.

    Reads every row of the ``data`` table in ``--input_sqlite``. Column 1 is
    parsed as JSON holding ``audio_url``; column 2 is parsed as a JSON list
    whose first element carries the label under ``type``. Each audio file is
    fetched into the ``--job_id`` directory, converted with ``flac2wav`` and
    decoded once with ``apply_effects_file`` as a sanity check.

    Rows that fail at any step are skipped (best effort); the row index and
    the error are printed, and the number of skipped rows is reported at the
    end.
    """
    args = docopt(__doc__)
    input_sqlite = args["--input_sqlite"]
    job_id = args["--job_id"]

    if not os.path.exists(job_id):
        # exist_ok guards against another process creating it in between.
        os.makedirs(job_id, exist_ok=True)

    # Make sure the database handle is released even if the query fails.
    conn = sqlite3.connect(input_sqlite)
    try:
        cur = conn.cursor()
        cur.execute("SELECT * FROM data")
        rows = cur.fetchall()
    finally:
        conn.close()

    base_path = "scripts/"
    records = []
    skipped = 0
    # Wrapping ``rows`` (not the enumerate object) lets tqdm show the total.
    for i, row in enumerate(tqdm(rows)):
        try:
            audio_url = json.loads(row[1])["audio_url"]
            intent_label = json.loads(row[2])[0]["type"]
            path = audio_url_to_file(audio_url, job_id)
            path = flac2wav(path)
            # Decode only to verify the file is readable; the audio is unused.
            apply_effects_file(str(path), EFFECTS)
            records.append([base_path + path, intent_label])
        except Exception as exc:
            # Best effort: skip the row, but do not swallow KeyboardInterrupt
            # or SystemExit, and report why the row was dropped.
            skipped += 1
            print(i, repr(exc))

    input_df = pd.DataFrame(records, columns=["audio_path", "label"])
    input_df.to_csv(f"{job_id}.csv", index=False)
    print("total audios skipped :", skipped)
def segment_processing(self):
    """Split every entry of ``self.pair_dict['wav_table']`` into window segments.

    Each table entry is indexed as ``(label, pair_info, wav_path)``. The file
    is decoded with ``apply_effects_file`` and cut into windows of
    ``self.segment_config["window"]`` advanced by
    ``self.segment_config['stride']``. A file shorter than one window
    contributes a single segment covering the whole file.

    Returns:
        list: one record per segment, laid out as
        ``[label, pair_info, utterance_id, segment_num, start, end, wav_path]``
        with ``utterance_id`` being the stringified position of the file in
        the table.
    """
    table = self.pair_dict['wav_table']
    segments = []
    print("processing test set to segments")

    for utt_index, entry in enumerate(tqdm.tqdm(table)):
        label = entry[0]
        pair = entry[1]
        source = entry[2]

        audio, _ = apply_effects_file(source, EFFECTS)
        audio = audio.squeeze(0)
        total = len(audio)

        last_start = total - self.segment_config["window"]
        segment_num = last_start // self.segment_config['stride']

        if last_start < 0:
            # Shorter than one window: keep the whole file as one segment.
            spans = [(0, total)]
        else:
            # NOTE(review): when last_start == 0 this range is empty, so a
            # file exactly one window long yields no segment, and segment_num
            # (floor division) can differ from the number of spans produced.
            # Confirm this is intended.
            spans = [
                (begin, begin + self.segment_config['window'])
                for begin in range(0, last_start, self.segment_config['stride'])
            ]

        for begin, end in spans:
            segments.append([
                int(label),
                pair,
                str(utt_index),
                segment_num,
                begin,
                end,
                source,
            ])

    return segments
def test_requests(self, ext, compression):
    """Effects applied to a streamed HTTP response body match the sox reference."""
    effects = [['band', '300', '10']]
    channels_first = True
    sample_rate = 16000
    # Only mp3 is passed with an explicit format; other inputs pass None.
    format_ = ext if ext == 'mp3' else None

    audio_file = f'input.{ext}'
    src_path = self.get_temp_path(audio_file)
    ref_path = self.get_temp_path('reference.wav')

    # Generate the input and the reference output with the sox utilities.
    sox_utils.gen_audio_file(
        src_path, sample_rate, num_channels=2, compression=compression)
    sox_utils.run_sox_effect(
        src_path, ref_path, effects, output_bitdepth=32)
    expected, expected_sr = load_wav(ref_path)

    # Apply the same effects to the raw stream of an HTTP response.
    with requests.get(self.get_url(audio_file), stream=True) as resp:
        found, sr = sox_effects.apply_effects_file(
            resp.raw, effects, channels_first=channels_first, format=format_)

    result_path = self.get_temp_path('result.wav')
    save_wav(result_path, found, sr, channels_first=channels_first)

    assert sr == expected_sr
    self.assertEqual(found, expected)
def _load_wav_with_speed(wav_file, speed):
    """Load audio from ``wav_file``, optionally applying speed perturbation.

    Args:
        wav_file: audio file handed to ``torchaudio.load`` or the sox effects
        speed: speed factor; ``1.0`` loads the file unchanged

    Returns:
        (wav, sr): the loaded waveform and the second value returned by the
        loader. For ``speed != 1.0`` the sox ``speed`` effect is followed by
        ``rate`` set to the file's original sample rate.
    """
    if speed == 1.0:
        wav, sr = torchaudio.load(wav_file)
        return wav, sr

    sample_rate = torchaudio.backend.sox_io_backend.info(wav_file).sample_rate

    # Encode "major.minor" of the torchaudio version as a single integer.
    version_parts = torchaudio.__version__.split(".")
    ta_version = 100 * int(version_parts[0]) + 10 * int(version_parts[1])

    if ta_version >= 80:
        # Note: enable in torchaudio>=0.8.0
        wav, sr = sox_effects.apply_effects_file(
            wav_file,
            [['speed', str(speed)], ['rate', str(sample_rate)]])
    else:
        # Note: deprecated in torchaudio>=0.8.0
        chain = sox_effects.SoxEffectsChain()
        chain.append_effect_to_chain('speed', speed)
        chain.append_effect_to_chain("rate", sample_rate)
        chain.set_input_file(wav_file)
        wav, sr = chain.sox_build_flow_effects()

    return wav, sr
def main():
    """Export ``(audio_path, label, alternatives)`` rows from JSON to a CSV.

    Reads the list stored in ``--input_json``. For each row the audio at
    ``row["data"]["audio_url"]`` is fetched into ``--audio_dir``, converted
    with ``flac2wav`` and decoded once with ``apply_effects_file`` as a
    sanity check. The transcripts of ``row["data"]["alternatives"][0]`` are
    joined with ``"</s></s>"``. The result is written to ``<audio_dir>.csv``.

    Any existing ``--audio_dir`` is deleted and recreated empty. No row is
    skipped: an error while processing a row propagates to the caller.
    """
    # Function-scope import: only this entry point needs it.
    import shutil

    args = docopt(__doc__)
    input_json = args["--input_json"]
    audio_dir = args["--audio_dir"]

    # os.remove() cannot delete a directory, so a real directory needs
    # rmtree; plain files and symlinks are still unlinked as before.
    if os.path.isdir(audio_dir) and not os.path.islink(audio_dir):
        shutil.rmtree(audio_dir)
    elif os.path.exists(audio_dir):
        os.remove(audio_dir)
    os.mkdir(audio_dir)

    with open(input_json, 'r') as f:
        rows = json.load(f)

    base_path = "scripts/"
    records = []
    # Kept for the final report; nothing is skipped in the current flow.
    skipped = 0
    for row in tqdm(rows):
        audio_url = row["data"]["audio_url"]
        print(audio_url)
        intent_label = row["intent"]
        alternatives = "</s></s>".join(
            [x['transcript'] for x in row["data"]["alternatives"][0]])
        path = audio_url_to_file(audio_url, audio_dir)
        path = flac2wav(path)
        # Decode only to verify the file is readable; the audio is unused.
        apply_effects_file(str(path), EFFECTS)
        records.append([base_path + path, intent_label, alternatives])

    input_df = pd.DataFrame(
        records, columns=["audio_path", "label", "alternatives"])
    input_df.to_csv(f"{audio_dir}.csv", index=False)
    print("total audios skipped :", skipped)
def find_queries(query_dir_path):
    """Load every ``*.wav`` query in ``query_dir_path``, grouped by query name.

    The query name is the file name with the ``.wav`` extension and an
    optional two-digit ``_NN`` suffix removed, e.g. ``sws2013_dev_123.wav``
    and ``sws2013_dev_123_01.wav`` both map to ``sws2013_dev_123``.

    Returns:
        defaultdict: query name -> list of loaded tensors (dimension 0
        squeezed), one per matching file.
    """
    suffix_pattern = re.compile(r"(_[0-9]{2})?\.wav")
    load_effects = [["channels", "1"], ["rate", "16000"], ["norm"]]
    # Run vad, reverse, vad, reverse: the same vad effect is applied in both
    # directions of the signal.
    trim_effects = [
        ["vad", "-T", "0.25", "-p", "0.1"],
        ["reverse"],
        ["vad", "-T", "0.25", "-p", "0.1"],
        ["reverse"],
    ]

    grouped = defaultdict(list)
    wav_paths = list(query_dir_path.glob("*.wav"))
    for wav_path in tqdm(wav_paths, ncols=0, desc="Load queries"):
        base_name = suffix_pattern.sub("", wav_path.name)
        full, rate = apply_effects_file(str(wav_path), load_effects)
        trimmed, _ = apply_effects_tensor(full, rate, trim_effects)

        # Fall back to the untrimmed audio when trimming leaves less than
        # half of ``rate`` along dimension 1.
        if trimmed.size(1) >= (rate * 0.5):
            chosen = trimmed
        else:
            chosen = full

        grouped[base_name].append(chosen.squeeze(0))

    return grouped
def test_tarfile(self, ext, compression):
    """Effects applied to a file object extracted from a tar archive match sox."""
    effects = [['band', '300', '10']]
    channels_first = True
    sample_rate = 16000
    # Only mp3 is passed with an explicit format; other inputs pass None.
    format_ = ext if ext == 'mp3' else None

    member_name = f'input.{ext}'
    src_path = self.get_temp_path(member_name)
    ref_path = self.get_temp_path('reference.wav')
    tar_path = self.get_temp_path('archive.tar.gz')

    # Generate the input and the reference output with the sox utilities.
    sox_utils.gen_audio_file(
        src_path, sample_rate, num_channels=2, compression=compression)
    sox_utils.run_sox_effect(
        src_path, ref_path, effects, output_bitdepth=32)
    expected, expected_sr = load_wav(ref_path)

    # Pack the input into an archive, then read it back as a file object.
    with tarfile.TarFile(tar_path, 'w') as writer:
        writer.add(src_path, arcname=member_name)
    with tarfile.TarFile(tar_path, 'r') as reader:
        member = reader.extractfile(member_name)
        found, sr = sox_effects.apply_effects_file(
            member, effects, channels_first=channels_first, format=format_)

    result_path = self.get_temp_path('result.wav')
    save_wav(result_path, found, sr, channels_first=channels_first)

    assert sr == expected_sr
    self.assertEqual(found, expected)
def test_apply_effects_file(self, args):
    """A scripted, saved and reloaded file transform matches the direct call."""
    channels_first = True
    effects = args['effects']
    num_channels = args.get("num_channels", 2)
    input_sr = args.get("input_sample_rate", 8000)

    # Round-trip the transform through TorchScript serialization.
    module = SoxEffectFileTransform(effects, channels_first)
    script_path = self.get_temp_path('sox_effect.zip')
    torch.jit.script(module).save(script_path)
    module = torch.jit.load(script_path)

    wav_path = self.get_temp_path('input.wav')
    signal = get_sinusoid(
        frequency=800,
        sample_rate=input_sr,
        n_channels=num_channels,
        dtype='float32',
        channels_first=channels_first,
    )
    save_wav(wav_path, signal, sample_rate=input_sr, channels_first=channels_first)

    found, sr_found = module(wav_path)
    # NOTE(review): ``channels_first`` is passed as the third positional
    # argument here - confirm it binds to the intended parameter.
    expected, sr_expected = sox_effects.apply_effects_file(
        wav_path, effects, channels_first)

    assert sr_found == sr_expected
    self.assertEqual(expected, found)
def test_apply_effects_path(self):
    """`apply_effects_file` given a ``Path`` object should match the sox command"""
    effects = [["hilbert"]]
    channels_first = True
    dtype = 'int32'
    num_channels = 2
    input_sr = 8000
    output_sr = 8000

    src_path = self.get_temp_path('input.wav')
    ref_path = self.get_temp_path('reference.wav')

    samples = get_wav_data(dtype, num_channels, channels_first=channels_first)
    save_wav(src_path, samples, input_sr, channels_first=channels_first)

    # Reference output produced by the sox utility.
    sox_utils.run_sox_effect(
        src_path, ref_path, effects, output_sample_rate=output_sr)
    expected, expected_sr = load_wav(ref_path)

    found, sr = sox_effects.apply_effects_file(
        Path(src_path),
        effects,
        normalize=False,
        channels_first=channels_first,
    )

    assert sr == expected_sr
    self.assertEqual(found, expected)
def test_apply_effects_str(self, args):
    """`apply_effects_file` given a string path should match the sox command"""
    channels_first = True
    dtype = 'int32'
    effects = args['effects']
    num_channels = args.get("num_channels", 2)
    input_sr = args.get("input_sample_rate", 8000)
    # None when the parameter set does not specify an output rate.
    output_sr = args.get("output_sample_rate")

    src_path = self.get_temp_path('input.wav')
    ref_path = self.get_temp_path('reference.wav')

    samples = get_wav_data(dtype, num_channels, channels_first=channels_first)
    save_wav(src_path, samples, input_sr, channels_first=channels_first)

    # Reference output produced by the sox utility.
    sox_utils.run_sox_effect(
        src_path, ref_path, effects, output_sample_rate=output_sr)
    expected, expected_sr = load_wav(ref_path)

    found, sr = sox_effects.apply_effects_file(
        src_path,
        effects,
        normalize=False,
        channels_first=channels_first,
    )

    assert sr == expected_sr
    self.assertEqual(found, expected)
def _load_wav_with_speed(wav_file, speed):
    """Load audio from ``wav_file``, optionally applying speed perturbation.

    Args:
        wav_file: audio file handed to ``torchaudio.load_wav`` or the sox
            effects
        speed: speed factor; ``1.0`` returns ``torchaudio.load_wav`` directly

    Returns:
        (wav, sr): for ``speed != 1.0`` the sox ``speed`` effect is followed
        by ``rate`` set to the file's original rate, and the waveform is
        multiplied by ``1 << 15``.
    """
    if speed == 1.0:
        return torchaudio.load_wav(wav_file)

    signal_info, _ = torchaudio.info(wav_file)

    # Encode "major.minor" of the torchaudio version as a single integer.
    version_parts = torchaudio.__version__.split(".")
    ta_version = 100 * int(version_parts[0]) + 10 * int(version_parts[1])

    if ta_version >= 80:
        # Note: enable in torchaudio>=0.8.0
        wav, sr = sox_effects.apply_effects_file(
            wav_file,
            [['speed', str(speed)], ['rate', str(signal_info.rate)]])
    else:
        # Note: deprecated in torchaudio>=0.8.0
        chain = sox_effects.SoxEffectsChain()
        chain.append_effect_to_chain('speed', speed)
        chain.append_effect_to_chain("rate", signal_info.rate)
        chain.set_input_file(wav_file)
        wav, sr = chain.sox_build_flow_effects()

    # sox will normalize the waveform, scale to [-32768, 32767]
    wav = wav * (1 << 15)
    return wav, sr
def test_bytesio(self, ext, compression):
    """Effects applied to an in-memory ``BytesIO`` copy of a file match sox."""
    effects = [['band', '300', '10']]
    channels_first = True
    sample_rate = 16000
    # Only mp3 is passed with an explicit format; other inputs pass None.
    format_ = ext if ext == 'mp3' else None

    src_path = self.get_temp_path(f'input.{ext}')
    ref_path = self.get_temp_path('reference.wav')

    # Generate the input and the reference output with the sox utilities.
    sox_utils.gen_audio_file(
        src_path, sample_rate, num_channels=2, compression=compression)
    sox_utils.run_sox_effect(
        src_path, ref_path, effects, output_bitdepth=32)
    expected, expected_sr = load_wav(ref_path)

    # Read the whole file into memory and hand over the buffer object.
    with open(src_path, 'rb') as handle:
        buffer = io.BytesIO(handle.read())
    found, sr = sox_effects.apply_effects_file(
        buffer, effects, channels_first=channels_first, format=format_)

    result_path = self.get_temp_path('result.wav')
    save_wav(result_path, found, sr, channels_first=channels_first)

    assert sr == expected_sr
    self.assertEqual(found, expected)
def __init__(self, vad_config, key_list, file_path, meta_data, max_timestep=None):
    """Collect ``[wav_path, utterance_id]`` pairs for every configured root.

    Args:
        vad_config: mapping read for its ``'min_sec'`` threshold; a file is
            kept only when its decoded length exceeds it
        key_list: one cache key per root, used in the cache file name
        file_path: root directories, indexed in parallel with ``key_list``;
            each root is scanned for one sub-directory per speaker
        meta_data: accepted for interface compatibility; not used here
        max_timestep: stored on the instance as ``self.max_timestep``

    For each root the accepted paths are cached as a pickle under
    ``cache_wav_paths/cache_<key>.p`` next to this module, so later runs
    skip decoding every file.
    """
    self.roots = file_path
    self.root_key = key_list
    self.max_timestep = max_timestep
    self.vad_c = vad_config
    self.dataset = []
    self.all_speakers = []

    def _utterance_id(wav_path):
        # Last three path components joined by "-", with ".wav" removed.
        tail = "/".join(str(wav_path).split("/")[-3:])
        return tail.replace(".wav", "").replace("/", "-")

    for index in range(len(self.root_key)):
        cache_path = (
            Path(os.path.dirname(__file__))
            / 'cache_wav_paths'
            / f'cache_{self.root_key[index]}.p'
        )
        p = Path(self.roots[index])

        # Load the cache if the file exists.
        if os.path.isfile(cache_path):
            # cache dict:
            # {
            #     "speaker_id1": ["wav_a_path1", "wav_a_path2", ...],
            #     "speaker_id2": ["wav_b_path1", "wav_b_path2", ...],
            #     ...,
            # }
            # NOTE: pickle must only be used on trusted files; this cache is
            # the one written by the else-branch below.
            with open(cache_path, "rb") as cache_file:
                cache_wavs_dict = pickle.load(cache_file)
            self.all_speakers.extend(list(cache_wavs_dict.keys()))
            for speaker_id in list(cache_wavs_dict.keys()):
                for wavs in cache_wavs_dict[speaker_id]:
                    wav_path = p / speaker_id / wavs
                    self.dataset.append([str(wav_path), _utterance_id(wav_path)])
        else:
            speaker_wav_dict = {}
            speaker_dirs = [
                f.path.split("/")[-1]
                for f in os.scandir(self.roots[index])
                if f.is_dir()
            ]
            self.all_speakers.extend(speaker_dirs)

            print("search all wavs paths")
            start = time.time()
            for speaker in tqdm.tqdm(speaker_dirs):
                speaker_dir = p / speaker
                wav_list = find_files(speaker_dir)
                speaker_wav_dict[speaker] = []
                for wav in wav_list:
                    wav_path = speaker_dir / wav
                    wav_sample, _ = apply_effects_file(str(wav_path), EFFECTS)
                    wav_sample = wav_sample.squeeze(0)
                    length = wav_sample.shape[0]
                    # Only files above the configured threshold are kept and
                    # recorded in the cache.
                    if length > self.vad_c['min_sec']:
                        self.dataset.append([str(wav_path), _utterance_id(wav_path)])
                        speaker_wav_dict[speaker].append("/".join(wav.split("/")[-2:]))
            end = time.time()

            print(f"search all wavs paths costs {end-start} seconds")
            print(f"save wav paths to {cache_path}! so we can directly load all_path in next time!")
            # The cache directory may not exist on a first run.
            cache_path.parent.mkdir(parents=True, exist_ok=True)
            with open(cache_path, "wb") as cache_file:
                pickle.dump(speaker_wav_dict, cache_file)

    self.speaker_num = len(self.all_speakers)
    self.necessary_dict = self.processing()
    self.label_mapping_spk_id = {}
    # speaker id map to speaker num
    self.build_label_mapping()
    self.label = self.build_label(self.dataset)
def path2tensor(filepath):
    """Decode ``filepath`` with sox effects and return the squeezed tensor.

    The effects applied are ``channels 1``, ``rate 16000`` and ``norm``;
    dimension 0 of the result is squeezed away.
    """
    effects = [["channels", "1"], ["rate", "16000"], ["norm"]]
    waveform, _ = apply_effects_file(str(filepath), effects)
    return waveform.squeeze(0)
def __getitem__(self, idx):
    """Return ``(wav, utterance_id, label)`` for the item at ``idx``.

    The file at ``self.dataset[idx][0]`` is decoded with the module-level
    ``EFFECTS`` and dimension 0 is squeezed away. When ``self.max_timestep``
    is set and the waveform is longer, a random crop of exactly
    ``max_timestep`` elements is taken.

    Returns:
        tuple: the waveform as a NumPy array, ``self.dataset[idx][1]`` and
        ``self.label[idx]``.
    """
    wav, _ = apply_effects_file(self.dataset[idx][0], EFFECTS)
    wav = wav.squeeze(0)
    length = wav.shape[0]

    # Identity comparison is the correct test for "no limit configured".
    if self.max_timestep is not None and length > self.max_timestep:
        # randint is inclusive on both ends, so the crop always fits.
        start = random.randint(0, int(length - self.max_timestep))
        wav = wav[start:start + self.max_timestep]

    return wav.numpy(), self.dataset[idx][1], self.label[idx]
def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first):
    """`apply_effects_file` with an empty effect list should return the input data"""
    wav_path = self.get_temp_path('input.wav')
    original = get_wav_data(dtype, num_channels, channels_first=channels_first)
    save_wav(wav_path, original, sample_rate, channels_first=channels_first)

    no_effects = []
    loaded, loaded_sr = sox_effects.apply_effects_file(
        wav_path,
        no_effects,
        normalize=False,
        channels_first=channels_first,
    )

    assert loaded_sr == sample_rate
    self.assertEqual(original, loaded)
def __getitem__(self, idx):
    """Return ``(wav, second_field)`` for the entry at ``idx`` of ``self.data_list``.

    The file named by the entry's first field is decoded with the
    module-level ``EFFECTS`` and dimension 0 is squeezed away. When
    ``self.max_timestep`` is set and the waveform is longer, a random crop of
    exactly ``max_timestep`` elements is returned. The waveform is returned
    as produced by the loader (not converted to NumPy).
    """
    entry = self.data_list[idx]
    wav, _ = apply_effects_file(str(entry[0]), EFFECTS)
    wav = wav.squeeze(0)

    limit = self.max_timestep
    if limit is not None and wav.shape[0] > limit:
        # randint is inclusive on both ends, so the crop always fits.
        offset = random.randint(0, int(wav.shape[0] - limit))
        wav = wav[offset:offset + limit]

    return wav, entry[1]
def __getitem__(self, idx):
    """Return ``(wav, name)`` for the audio path stored at ``idx``.

    The file is decoded with the sox effects ``channels 1``, ``rate 16000``
    and ``gain -3.0``. ``name`` is the file name with its last suffix removed.
    """
    source = self.data[idx]
    effects = [
        ["channels", "1"],
        ["rate", "16000"],
        ["gain", "-3.0"],
    ]
    waveform, _ = apply_effects_file(str(source), effects)
    stem_name = source.with_suffix("").name
    return waveform.squeeze(0).numpy(), stem_name
def test_mp3(self):
    """Passing ``format="mp3"`` allows reading an mp3 file that has no extension.

    libsox does not check header for mp3
    https://github.com/pytorch/audio/issues/1040

    The asset was generated with the following command

    ffmpeg -f lavfi -i "sine=frequency=1000:duration=5" -ar 16000 -f mp3 test_noext
    """
    asset = get_asset_path("mp3_without_ext")
    band_effect = [['band', '300', '10']]
    # Only the returned sample rate is checked; the audio data is discarded.
    _, rate = sox_effects.apply_effects_file(asset, band_effect, format="mp3")
    assert rate == 16000
def test_apply_effects_file(self, args):
    """`apply_effects_file` should run on the given effects without raising.

    Nothing is compared here; the call only has to complete and return a
    pair of values.
    """
    channels_first = True
    dtype = 'int32'
    effects = args['effects']
    num_channels = args.get("num_channels", 2)
    input_sr = args.get("input_sample_rate", 8000)

    wav_path = self.get_temp_path('input.wav')
    samples = get_wav_data(dtype, num_channels, channels_first=channels_first)
    save_wav(wav_path, samples, input_sr, channels_first=channels_first)

    _found, _sr = sox_effects.apply_effects_file(
        wav_path,
        effects,
        normalize=False,
        channels_first=channels_first,
    )
def __getitem__(self, idx):
    """Return ``(wav, utterance_id, label)`` for the path stored at ``idx``.

    The file is decoded with the module-level ``EFFECTS`` and dimension 0 is
    squeezed away. When ``self.max_timestep`` is set and the waveform is
    longer, a random crop of exactly ``max_timestep`` elements is taken.

    ``utterance_id`` is the last three path components joined by ``"-"`` with
    ``".wav"`` removed. ``label`` is the position of the first of those three
    components in ``self.all_speakers``.

    Raises:
        ValueError: if that path component is not in ``self.all_speakers``.
    """
    path = self.dataset[idx]
    wav, _ = apply_effects_file(str(path), EFFECTS)
    wav = wav.squeeze(0)
    length = wav.shape[0]

    # Identity comparison is the correct test for "no limit configured".
    if self.max_timestep is not None and length > self.max_timestep:
        # randint is inclusive on both ends, so the crop always fits.
        start = random.randint(0, int(length - self.max_timestep))
        wav = wav[start:start + self.max_timestep]

    tags = Path(path).parts[-3:]
    utterance_id = "-".join(tags).replace(".wav", "")
    label = self.all_speakers.index(tags[0])
    return wav.numpy(), utterance_id, label
def __getitem__(self, idx):
    """Return ``(segments, n_segments, name)`` for the audio path at ``idx``.

    The file is decoded with sox effects (``channels 1``, ``rate 16000``,
    ``norm``, vad/reverse applied twice, then ``pad 0 3``). The squeezed
    waveform is cut into windows of 48000 elements taken every 12000
    elements. ``name`` is the file name with its last suffix removed.
    """
    source = self.data[idx]
    vad = ["vad", "-T", "0.25", "-p", "0.1"]
    effects = [
        ["channels", "1"],
        ["rate", "16000"],
        ["norm"],
        list(vad),
        ["reverse"],
        list(vad),
        ["reverse"],
        ["pad", "0", "3"],
    ]
    waveform, _ = apply_effects_file(str(source), effects)

    flat = waveform.squeeze(0)
    windows = flat.unfold(0, 48000, 12000)
    segments = windows.unbind(0)
    return segments, len(segments), source.with_suffix("").name
def test_wav(self, dtype, sample_rate, num_channels):
    """`apply_effects_file` should match the sox command on wav input"""
    effects = [['band', '300', '10']]
    channels_first = True

    src_path = self.get_temp_path('input.wav')
    ref_path = self.get_temp_path('reference.wav')

    samples = get_wav_data(dtype, num_channels, channels_first=channels_first)
    save_wav(src_path, samples, sample_rate, channels_first=channels_first)

    # Reference output produced by the sox utility.
    sox_utils.run_sox_effect(src_path, ref_path, effects)
    expected, expected_sr = load_wav(ref_path)

    found, sr = sox_effects.apply_effects_file(
        src_path,
        effects,
        normalize=False,
        channels_first=channels_first,
    )

    assert sr == expected_sr
    self.assertEqual(found, expected)
def test_vorbis(self, sample_rate, num_channels):
    """`apply_effects_file` should match the sox command on vorbis input"""
    effects = [['band', '300', '10']]
    channels_first = True

    src_path = self.get_temp_path('input.vorbis')
    ref_path = self.get_temp_path('reference.wav')

    # Generate the input and the reference output with the sox utilities.
    sox_utils.gen_audio_file(src_path, sample_rate, num_channels)
    sox_utils.run_sox_effect(
        src_path, ref_path, effects, output_bitdepth=32)
    expected, expected_sr = load_wav(ref_path)

    found, sr = sox_effects.apply_effects_file(
        src_path, effects, channels_first=channels_first)

    result_path = self.get_temp_path('result.wav')
    save_wav(result_path, found, sr, channels_first=channels_first)

    assert sr == expected_sr
    self.assertEqual(found, expected)
def __getitem__(self, idx):
    """Return ``(wav_segments, system_name)`` for the entry at ``idx``.

    The file ``self.wav_dir / wav_name`` is decoded with the sox effects
    ``channels 1``, ``rate 16000`` and ``norm``, flattened with ``view(-1)``
    and passed to ``unfold_segments`` with ``self.segments_durations``.
    ``system_name`` is the part of the file name before the first ``"_"``.
    """
    wav_name = self.wav_list[idx]
    effects = [["channels", "1"], ["rate", "16000"], ["norm"]]
    waveform, _ = apply_effects_file(str(self.wav_dir / wav_name), effects)

    flat = waveform.view(-1)
    wav_segments = unfold_segments(flat, self.segments_durations)

    system_name, *_ = wav_name.name.split("_")
    return wav_segments, system_name
def __getitem__(self, idx):
    """Return ``(wav_segments, mean, system_name, mos, judge_id)`` for row ``idx``.

    The row of ``self.dataframe`` unpacks to
    ``(wav_name, mean, mos, judge_id)``. The file is read from
    ``Converted_speech_of_submitted_systems`` under ``self.base_path``,
    decoded with the sox effects ``channels 1``, ``rate 16000`` and ``norm``,
    flattened with ``view(-1)`` and passed to ``unfold_segments``.
    ``system_name`` is the first three characters of ``wav_name`` followed by
    the four characters preceding its last four.
    """
    wav_name, mean, mos, judge_id = self.dataframe.loc[idx]

    source = self.base_path / "Converted_speech_of_submitted_systems" / wav_name
    effects = [["channels", "1"], ["rate", "16000"], ["norm"]]
    waveform, _ = apply_effects_file(str(source), effects)

    wav_segments = unfold_segments(waveform.view(-1), self.segments_durations)

    prefix = wav_name[:3]
    tail = wav_name[-8:-4]
    return wav_segments, mean, prefix + tail, mos, judge_id
def path2segment(filepath, src_dur, tgt_dur, offset):
    """Cut a randomly shifted segment of ``tgt_dur`` out of ``filepath``.

    The audio is decoded with ``channels 1``, ``rate 16000`` and ``norm``,
    padded by ``tgt_dur`` on both sides, then trimmed to ``tgt_dur`` starting
    at ``tgt_dur + offset + shift``. ``shift`` is drawn uniformly between 0
    and ``src_dur - tgt_dur``.

    Returns:
        The tensor returned by ``apply_effects_file`` (not squeezed).
    """
    shift = random.uniform(0, src_dur - tgt_dur)
    # The leading tgt_dur compensates for the padding added in front.
    trim_start = tgt_dur + offset + shift

    effects = [
        ["channels", "1"],
        ["rate", "16000"],
        ["norm"],
        ["pad", f"{tgt_dur}", f"{tgt_dur}"],
        ["trim", f"{trim_start}", f"{tgt_dur}"],
    ]
    segment, _ = apply_effects_file(str(filepath), effects)
    return segment
def __getitem__(self, idx):
    """Return ``(segments, n_segments, audio_name)`` for the entry at ``idx``.

    Indices below ``self.n_queries`` resolve under ``<split>_queries`` in
    ``self.dataset_root``; all others resolve under ``Audio``. The path is
    given a ``.wav`` suffix and decoded with sox effects (``channels 1``,
    ``rate 16000``, ``norm``, vad/reverse applied twice, then ``pad 0 3``).
    The squeezed waveform is cut into windows of 48000 elements taken every
    12000 elements.
    """
    audio_name = self.data[idx]

    if idx < self.n_queries:
        parent = self.dataset_root / f"{self.split}_queries"
    else:
        parent = self.dataset_root / "Audio"
    source = (parent / audio_name).with_suffix(".wav")

    vad = ["vad", "-T", "0.25", "-p", "0.1"]
    effects = [
        ["channels", "1"],
        ["rate", "16000"],
        ["norm"],
        list(vad),
        ["reverse"],
        list(vad),
        ["reverse"],
        ["pad", "0", "3"],
    ]
    waveform, _ = apply_effects_file(str(source), effects)

    windows = waveform.squeeze(0).unfold(0, 48000, 12000)
    segments = windows.unbind(0)
    return segments, len(segments), audio_name