def resample_folder(input_folder, output_folder, fs, regex):
    """Resample every matching audio file under ``input_folder`` to ``fs`` Hz.

    The relative directory layout is mirrored under ``output_folder``.

    Args:
        input_folder (str): root folder scanned for audio files.
        output_folder (str): root folder the resampled files are written to.
        fs (int): target sample rate in Hz.
        regex (str): pattern forwarded to ``get_all_files(match_and=[regex])``
            to select which files are processed.
    """
    files = get_all_files(input_folder, match_and=[regex])
    torchaudio.initialize_sox()
    for f in tqdm.tqdm(files):
        # we use sox because torchaudio.Resample uses too much RAM.
        resample = torchaudio.sox_effects.SoxEffectsChain()
        resample.append_effect_to_chain("rate", [fs])
        resample.set_input_file(f)
        # Bind the returned rate to a fresh name: the original code assigned it
        # to `fs`, clobbering the target-rate parameter and (if sox ever
        # returned a different rate) silently changing the rate requested for
        # every subsequent file.
        audio, out_fs = resample.sox_build_flow_effects()
        # Peak-normalize; clamp the peak so an all-zero (silent) file does not
        # produce a division by zero / NaN waveform.
        peak = torch.max(torch.abs(audio), dim=-1, keepdim=True)[0].clamp(min=1e-9)
        audio = audio / peak  # scale back otherwise you get empty .wav file
        # Compute the mirrored output path once instead of re-joining it twice.
        out_path = Path(output_folder) / Path(f).relative_to(Path(input_folder))
        os.makedirs(out_path.parent, exist_ok=True)
        torchaudio.save(str(out_path), audio, out_fs)
    torchaudio.shutdown_sox()
def main():
    """Smoke-test loading of the FMA dataset through the project `setup` helpers.

    Prints environment info, iterates one pass over the data loader moving each
    batch to the selected device, and reports progress. No training is done.
    """
    print('PyTorch', setup.torch_version())
    print('CUDA is available:', setup.cuda_is_available())
    print('CUDA device count:', setup.cuda_device_count())
    directory = 'fma_small'
    batch_size = 8
    num_workers = 8
    dataset = FMA(directory)
    loader = setup.load(dataset, batch_size, num_workers)
    device = setup.device()
    model = setup.parallel(Example())
    model.to(device)
    torchaudio.initialize_sox()
    count = 0
    for batch in loader:
        sound, genre = batch
        # Tensor.to() is NOT in-place: keep the returned tensors. The original
        # discarded them, so the data never actually moved to `device`.
        sound = sound.to(device)
        genre = genre.to(device)
        # len(dataset) is the idiomatic spelling of dataset.__len__().
        count = min(count + batch_size, len(dataset))
        print('Loaded', count, '/', len(dataset))
    print('Done')
    torchaudio.shutdown_sox()
def __call__(self, wav=None, sr=None):
    """Randomly perturb the speed and pitch of a mono waveform via sox.

    Args:
        wav (np.ndarray): 1-D waveform; int16 or float32 samples
            (assumed — TODO confirm against callers).
        sr (int): sample rate of ``wav``.

    Returns:
        dict: ``{'wav': ..., 'sr': sr}`` — the augmented waveform (float) on
        success, or the untouched input if augmentation was skipped by chance
        or any step failed.
    """
    assert len(wav.shape) == 1  # mono input only
    _wav = None  # stays None => fall through to returning the input unchanged
    input_dtype = wav.dtype  # NOTE(review): captured but never used below
    try:
        # Apply the augmentation only with probability self.prob.
        if random.random() < self.prob:
            speed_alpha = 1.0 + self.speed_limit * random.uniform(-1, 1)
            pitch_alpha = self.pitch_limit * random.uniform(-1, 1) * 100  # in cents
            # https://github.com/carlthome/python-audio-effects/blob/master/pysndfx/dsp.py#L531
            # sox reads from a file, so round-trip through a temp .wav.
            with NamedTemporaryFile(suffix=".wav", dir=tempfile_dir) as temp_file:
                temp_filename = temp_file.name
                # always feed int16 to sox
                if wav.dtype == np.float32():
                    wav_int = float2int(wav)
                    wav_write(temp_filename, sr, wav_int)
                else:
                    wav_write(temp_filename, sr, wav)
                torchaudio.initialize_sox()
                effects = torchaudio.sox_effects.SoxEffectsChain()
                effects.append_effect_to_chain('pitch', pitch_alpha)
                effects.append_effect_to_chain('tempo', [speed_alpha])
                # 'rate' pins the output sample rate back to sr — presumably to
                # undo any rate change from the effects; verified by the assert.
                effects.append_effect_to_chain('rate', sr)
                effects.set_input_file(temp_filename)
                _wav, _sr = effects.sox_build_flow_effects()
                torchaudio.shutdown_sox()
                _wav = _wav.numpy().squeeze()
                assert sr == _sr
                # always float output
                if _wav.dtype == np.int16():
                    _wav = int2float(_wav)
    except Exception as e:
        # NOTE(review): broad catch is a deliberate best-effort fallback — any
        # failure degrades to "no augmentation", but is only printed.
        print(str(e))
    if _wav is not None:
        return {'wav': _wav, 'sr': sr}
    else:
        return {'wav': wav, 'sr': sr}
def sox_build_flow_effects(self, out: Optional[Tensor] = None ) -> Tuple[Tensor, int]: r"""Build effects chain and flow effects from input file to output tensor Args: out (Tensor, optional): Where the output will be written to. (Default: ``None``) Returns: Tuple[Tensor, int]: An output Tensor of size `[C x L]` or `[L x C]` where L is the number of audio frames and C is the number of channels. An integer which is the sample rate of the audio (as listed in the metadata of the file) """ # initialize output tensor if out is not None: torchaudio.check_input(out) else: out = torch.FloatTensor() if not len(self.chain): e = SoxEffect() e.ename = "no_effects" e.eopts = [""] self.chain.append(e) # print("effect options:", [x.eopts for x in self.chain]) torchaudio.initialize_sox() import _torch_sox sr = _torch_sox.build_flow_effects(self.input_file, out, self.channels_first, self.out_siginfo, self.out_encinfo, self.filetype, self.chain, self.MAX_EFFECT_OPTS) torchaudio._audio_normalization(out, self.normalization) return out, sr
def process(self): """Process the VCC2016 data if it doesn't exist in processed_folder already.""" import zipfile if self._check_exists(): return raw_abs_dir = os.path.join(self.root, self.raw_folder) processed_abs_dir = os.path.join(self.root, self.processed_folder) dset_abs_path = os.path.join(self.root, self.raw_folder, self.dset_path) try: os.makedirs(os.path.join(self.root, self.processed_folder)) os.makedirs(os.path.join(self.root, self.raw_folder)) except OSError as e: if e.errno == errno.EEXIST: pass else: raise zip_path = self.zip_path print('Unzipping', zip_path) filename = zip_path.rpartition('/')[2] file_path = os.path.join(self.root, self.raw_folder, filename) if not os.path.isfile(file_path): shutil.copy2(zip_path, file_path) if not os.path.exists(dset_abs_path): with zipfile.ZipFile(file_path) as zip_f: zip_f.extractall(raw_abs_dir) else: print("Using existing raw folder") if not self.dev_mode: os.unlink(file_path) # process and save as torch files torchaudio.initialize_sox() print('Processing...') shutil.copyfile(os.path.join(dset_abs_path, "README"), os.path.join(processed_abs_dir, "VCC2016_README")) audios = make_manifest(dset_abs_path) self.ids = load_ids(dset_abs_path) print("Found {} audio files".format(len(audios))) print('Extracting WORLD features.') spectras = [] aperiodicities = [] f0s = [] energies = [] labels = [] chunk_id = 0 samples = 0 self.speaker_offset_idx = {} self.chunk_indices = {} current_chunk_start_idx = 0 prev_speaker = -1 for f in audios: speaker = f.split("/", -1)[-2] spectra, aperiodicity, f0_, energy = read_audio_and_extract_features( f, trim_silence=self.trim_silence) # New speaker, save current chunk and start a fresh chunk if prev_speaker != -1 and speaker != prev_speaker: self.speaker_offset_idx[self.ids[speaker]] = samples print('Speaker {}: start idx: {}'.format(speaker, samples)) self.chunk_indices[chunk_id] = (current_chunk_start_idx, samples - 1) prev_speaker = speaker current_chunk_start_idx = samples 
self.save_WORLD_chunk(chunk_id, spectras, aperiodicities, f0s, energies, labels) chunk_id += 1 spectras = [] aperiodicities = [] f0s = [] energies = [] labels = [] elif prev_speaker == -1: prev_speaker = speaker # Add each spectral frame as a separate datapoint for i in range(spectra.shape[0]): sp = torch.tensor(spectra[i]).unsqueeze(0).float() ap = torch.tensor(aperiodicity[i]).unsqueeze(0).float() f0 = torch.tensor(f0_[i]).float() en = torch.tensor(energy[i]).float() spectras.append(sp) aperiodicities.append(ap) f0s.append(f0) energies.append(en) labels.append(self.ids[speaker]) samples += 1 if len(spectras) > 0: self.chunk_indices[chunk_id] = (current_chunk_start_idx, samples - 1) self.save_WORLD_chunk(chunk_id, spectras, aperiodicities, f0s, energies, labels) self._write_info(samples) # Compute each speaker statistics and add to the info file self.extract_dataset_max_min_and_speaker_profiles() if not self.dev_mode: shutil.rmtree(raw_abs_dir, ignore_errors=True) torchaudio.shutdown_sox() print('Done!')
def download(self): """Download the VCTK data if it doesn't exist in processed_folder already.""" from six.moves import urllib import tarfile if self._check_exists(): return raw_abs_dir = os.path.join(self.root, self.raw_folder) processed_abs_dir = os.path.join(self.root, self.processed_folder) dset_abs_path = os.path.join(self.root, self.raw_folder, self.dset_path) # download files try: os.makedirs(os.path.join(self.root, self.processed_folder)) os.makedirs(os.path.join(self.root, self.raw_folder)) except OSError as e: if e.errno == errno.EEXIST: pass else: raise url = self.url print('Downloading ' + url) filename = url.rpartition('/')[2] file_path = os.path.join(self.root, self.raw_folder, filename) if not os.path.isfile(file_path): urllib.request.urlretrieve(url, file_path) if not os.path.exists(dset_abs_path): with tarfile.open(file_path) as zip_f: zip_f.extractall(raw_abs_dir) else: print("Using existing raw folder") if not self.dev_mode: os.unlink(file_path) # process and save as torch files torchaudio.initialize_sox() print('Processing...') shutil.copyfile(os.path.join(dset_abs_path, "COPYING"), os.path.join(processed_abs_dir, "VCTK_COPYING")) audios = make_manifest(dset_abs_path) utterences = load_txts(dset_abs_path) self.max_len = 0 print("Found {} audio files and {} utterences".format( len(audios), len(utterences))) for n in range(len(audios) // self.chunk_size + 1): tensors = [] labels = [] lengths = [] st_idx = n * self.chunk_size end_idx = st_idx + self.chunk_size for i, f in enumerate(audios[st_idx:end_idx]): txt_dir = os.path.dirname(f).replace("wav48", "txt") if os.path.exists(txt_dir): f_rel_no_ext = os.path.basename(f).rsplit(".", 1)[0] sig = read_audio(f, downsample=self.downsample)[0] tensors.append(sig) lengths.append(sig.size(1)) labels.append(utterences[f_rel_no_ext]) self.max_len = sig.size( 1) if sig.size(1) > self.max_len else self.max_len # sort sigs/labels: longest -> shortest tensors, labels = zip( *[(b, c) for (a, b, c) in 
sorted(zip(lengths, tensors, labels), key=lambda x: x[0], reverse=True)]) data = (tensors, labels) torch.save( data, os.path.join(self.root, self.processed_folder, "vctk_{:04d}.pt".format(n))) self._write_info((n * self.chunk_size) + i + 1) if not self.dev_mode: shutil.rmtree(raw_abs_dir, ignore_errors=True) torchaudio.shutdown_sox() print('Done!')
def setUpClass(cls):
    """Bring up the sox backend once for every test in this class."""
    torchaudio.initialize_sox()
def process(self): """Process the VCTK data if it doesn't exist in processed_folder already.""" import zipfile if self._check_exists(): return raw_abs_dir = os.path.join(self.root, self.raw_folder) processed_abs_dir = os.path.join(self.root, self.processed_folder) dset_abs_path = os.path.join(self.root, self.raw_folder, self.dset_path) try: os.makedirs(os.path.join(self.root, self.processed_folder)) os.makedirs(os.path.join(self.root, self.raw_folder)) except OSError as e: if e.errno == errno.EEXIST: pass else: raise zip_path = self.zip_path print('Unzipping', zip_path) filename = zip_path.rpartition('/')[2] file_path = os.path.join(self.root, self.raw_folder, filename) if not os.path.isfile(file_path): shutil.copy2(zip_path, file_path) if not os.path.exists(dset_abs_path): with zipfile.ZipFile(file_path) as zip_f: zip_f.extractall(raw_abs_dir) else: print("Using existing raw folder") if not self.dev_mode: os.unlink(file_path) # process and save as torch files torchaudio.initialize_sox() print('Processing...') shutil.copyfile(os.path.join(dset_abs_path, "COPYING"), os.path.join(processed_abs_dir, "VCTK_COPYING")) audios = make_manifest(dset_abs_path, self.shuffle_order) self.ids = load_ids(dset_abs_path) self.max_len = 0 all_lengths = [] print("Found {} audio files".format(len(audios))) for n in range(len(audios) // self.chunk_size + 1): tensors = [] labels = [] lengths = [] st_idx = n * self.chunk_size end_idx = st_idx + self.chunk_size for i, f in enumerate(audios[st_idx:end_idx]): f_rel_no_ext = os.path.basename(f).rsplit(".", 1)[0] sig = read_audio(f, downsample=self.downsample, trim_silence=self.trim_silence)[0] tensors.append(sig) lengths.append(sig.size(1)) labels.append(self.ids[f_rel_no_ext.split('_')[0]]) self.max_len = sig.size( 1) if sig.size(1) > self.max_len else self.max_len all_lengths.append(sig.size(1)) # sort sigs/labels: longest -> shortest tensors, labels = zip( *[(b, c) for (a, b, c) in sorted(zip(lengths, tensors, labels), key=lambda x: x[0], 
reverse=True)]) data = (tensors, labels) torch.save( data, os.path.join(self.root, self.processed_folder, "vctk_{:04d}.pt".format(n))) self.mean_len = np.mean(all_lengths) self.std_len = np.std(all_lengths, ddof=1) self._write_info((n * self.chunk_size) + i + 1) if not self.dev_mode: shutil.rmtree(raw_abs_dir, ignore_errors=True) torchaudio.shutdown_sox() print('Done!')
def __call__(self, audio_path, sample_rate, normalize=True,
             defaults=dict(pitch=[-300, 300], tempo=[0.8, 1.2], gain=[-10, 10]),
             tmpdir='/dev/shm'):
    """Load ``audio_path`` as a mono signal at ``sample_rate``, optionally
    applying one randomly chosen sox transform, via one of three decode paths
    selected by ``self.bug`` ('SoxEffectsChain', 'as_tensor', or numpy).

    Args:
        audio_path: path to the input audio file.
        sample_rate: target sample rate (also used for resampling checks).
        normalize: if True, run the signal through models.normalize_signal.
        defaults: per-effect parameter ranges used when a transform is given
            by name only. NOTE(review): mutable default argument — safe here
            only because it is read (``.get``, ``in``) and never mutated.
        tmpdir: where temporary transcode files are created (tmpfs by default).

    Returns:
        (signal, sample_rate) tuple.
    """
    effect = None
    # Expand a bare effect name into (name, default_param_range).
    tuple_if_str = lambda t: (t, defaults.get(t)) if isinstance(t, str) else t
    if self.transforms and random.random() < self.prob:
        transform = tuple_if_str(random.choice(self.transforms))
        # effect becomes: [name, sampled_param] for known sox effects,
        # the bare name string for others (e.g. 'transcode_*'),
        # or [] for a callable transform applied post-decode at the bottom.
        effect = ([transform[0], fixed_or_choice(transform[1])]
                  if transform[0] in defaults else transform[0]) if isinstance(
                      transform, tuple) else []
    tmp_audio_path = []
    # 'transcode_<codec>' effects round-trip the audio through the codec on
    # disk instead of going through an effects chain.
    if effect and isinstance(effect, str) and effect.startswith('transcode'):
        codec = effect.split('_')[1]
        tmp_audio_path = [
            tempfile.mkstemp(suffix='.' + codec, dir=tmpdir)[1],
            tempfile.mkstemp(suffix='.wav', dir=tmpdir)[1]
        ]
        subprocess.check_call([
            'sox', '-V0', audio_path, '-t', codec, '-r', str(sample_rate),
            tmp_audio_path[0]
        ])
        if self.bug == 'SoxEffectsChain':
            # Extra convert back to wav so SoxEffectsChain can read it.
            subprocess.check_call([
                'sox', '-V0', tmp_audio_path[0], '-t', 'wav', tmp_audio_path[1]
            ])
            audio_path = tmp_audio_path[1]
        else:
            audio_path = tmp_audio_path[0]
        effect = None  # transcode already applied; nothing left for decode
    if self.bug == 'SoxEffectsChain':
        # Decode path 1: torchaudio's (legacy) in-process sox effects chain.
        torchaudio.initialize_sox()
        sox = torchaudio.sox_effects.SoxEffectsChain()
        if effect:
            sox.append_effect_to_chain(*effect)
        sox.append_effect_to_chain('channels', 1)
        sox.append_effect_to_chain('rate', sample_rate)
        sox.set_input_file(audio_path)
        signal, sample_rate_ = sox.sox_build_flow_effects()
        signal = signal[0]  # first (only) channel
        sox.clear_chain()
        torchaudio.shutdown_sox()
    elif self.bug == 'as_tensor':
        # Decode path 2: raw int16 PCM from a sox subprocess -> torch tensor.
        signal, sample_rate_ = torch.as_tensor(
            bytearray(
                subprocess.check_output([
                    'sox', '-V0', audio_path, '-b', '16', '-e', 'signed',
                    '--endian', 'little', '-r', str(sample_rate), '-c', '1',
                    '-t', 'raw', '-'
                ] + ([effect[0], str(effect[1])] if effect else []))),
            dtype=torch.int16), sample_rate
    else:
        # Decode path 3: same sox subprocess, but via numpy, cast to float32.
        signal, sample_rate_ = torch.from_numpy(
            np.frombuffer(
                subprocess.check_output([
                    'sox', '-V0', audio_path, '-b', '16', '-e', 'signed',
                    '--endian', 'little', '-r', str(sample_rate), '-c', '1',
                    '-t', 'raw', '-'
                ] + ([effect[0], str(effect[1])] if effect else [])),
                dtype=np.int16)).to(torch.float32), sample_rate
    # Clean up any transcode temp files (note: rebinds audio_path).
    for audio_path in tmp_audio_path:
        os.remove(audio_path)
    if sample_rate is not None and sample_rate_ != sample_rate:
        signal, sample_rate_ = dataset.resample(signal, sample_rate_,
                                                sample_rate)
    if normalize:
        signal = models.normalize_signal(signal)
    # effect == [] marks a callable transform chosen above; apply it now.
    if effect == []:
        signal, sample_rate = transform(signal, sample_rate)
    return signal, sample_rate
def initialize_sox(): """Initialize sox backend only if it has not yet.""" global _IS_SOX_INITIALIZED if not _IS_SOX_INITIALIZED: torchaudio.initialize_sox() _IS_SOX_INITIALIZED = True
def process(self): """Process the VCC2016 data if it doesn't exist in processed_folder already.""" import zipfile if self._check_exists(): return raw_abs_dir = os.path.join(self.root, self.raw_folder) processed_abs_dir = os.path.join(self.root, self.processed_folder) dset_abs_path = os.path.join(self.root, self.raw_folder, self.dset_path) try: os.makedirs(os.path.join(self.root, self.processed_folder)) os.makedirs(os.path.join(self.root, self.raw_folder)) except OSError as e: if e.errno == errno.EEXIST: pass else: raise zip_path = self.zip_path print('Unzipping', zip_path) filename = zip_path.rpartition('/')[2] file_path = os.path.join(self.root, self.raw_folder, filename) if not os.path.isfile(file_path): shutil.copy2(zip_path, file_path) if not os.path.exists(dset_abs_path): with zipfile.ZipFile(file_path) as zip_f: zip_f.extractall(raw_abs_dir) else: print("Using existing raw folder") if not self.dev_mode: os.unlink(file_path) # process and save as torch files torchaudio.initialize_sox() print('Processing...') shutil.copyfile(os.path.join(dset_abs_path, "README"), os.path.join(processed_abs_dir, "VCC2016_README")) audios = make_manifest(dset_abs_path) self.ids = load_ids(dset_abs_path) print("Found {} audio files".format(len(audios))) print('Splitting samples to length {}'.format(self.sample_length)) tensors = [] labels = [] chunk_id = 0 samples = 0 self.speaker_offset_idx = {} prev_speaker = -1 for f in audios: speaker = f.split("/", -1)[-2] sig, _ = read_audio(f, trim_silence=self.trim_silence) # New speaker, save current chunk and start a fresh chunk if prev_speaker == -1 or speaker != prev_speaker: self.speaker_offset_idx[self.ids[speaker]] = samples print('Speaker {}: start idx: {}'.format(speaker, samples)) prev_speaker = speaker length = sig.size(1) # Cut the end of the sample if its too long to be equally split. 
if length % self.sample_length > 0: sig = sig[:, :length - (length % self.sample_length)] # Split samples sigs = sig.view(-1, self.sample_length) for sig in sigs: sig = sig.unsqueeze(0) tensors.append(sig) labels.append(self.ids[speaker]) self.max_len = sig.size( 1) if sig.size(1) > self.max_len else self.max_len samples += 1 # Save to chunk-file # if len(tensors) == self.chunk_size: # self.save_chunk(chunk_id, lengths, tensors, labels) # chunk_id += 1 # tensors = [] # labels = [] # Save all to one chunk-file if len(tensors) > 0: self.save_raw_chunk(chunk_id, tensors, labels) self._write_info(samples) if not self.dev_mode: shutil.rmtree(raw_abs_dir, ignore_errors=True) torchaudio.shutdown_sox() print('Done!')
parser.add_argument('--backend', default='nccl', type=str) parser.add_argument('--init-method', default='env://', type=str) parser.add_argument('--local-rank', '--local_rank', '--gpu', default=0, type=int) parser.add_argument('--sync-bn', action='store_true', default=False) # F16 training parser.add_argument('--opt-level', default='O0', type=str, choices=['O0', 'O1', 'O2', 'O3']) parser.add_argument('--keep-batchnorm-fp32', default=None, action='store_true') parser.add_argument('--loss-scale', type=str, default=None) parser.add_argument('--verbosity', '-v', action='count', default=0) args = parser.parse_args() # Initialize sox torchaudio.initialize_sox() args.world_size = 1 # Pin GPU to be used to process local rank (one GPU per process) torch.cuda.set_device(args.local_rank) if args.deterministic: torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 if args.distributed: